MÉTODOS DE CLASIFICACIÓN SUPERVISADA
ANÁLISIS DE DISCRIMINANTE LINEAL (LDA)
# No se cumple normalidad ni homocedasticidad. No obstante, analizo su rendimiento.
# ---- LDA model on the numeric predictors (column 2 dropped from the data) ----
# Normality/homoscedasticity do not hold, but we still evaluate LDA's performance.
set.seed(1)
task <- makeClassifTask(data = datos_tr[-2], target = "diagnosis")
lrn <- makeLearner("classif.lda", predict.type = "prob")
mod1 <- mlr::train(lrn, task)
# TEST predictions: accuracy and AUC (ROC)
pred_lda <- predict(mod1, newdata = datos_te[-2])
df_te <- as.data.frame(pred_lda)
acc_lda1 <- round(measureACC(df_te$truth, df_te$response), 3)
AUC_lda_te <- round(measureAUC(df_te$prob.maligno, df_te$truth, 'normal', 'maligno'), 3)
# TRAIN (naive) predictions: accuracy and AUC (ROC)
pred_lda2 <- predict(mod1, newdata = datos_tr[-2])
df_tr <- as.data.frame(pred_lda2)
acc_lda2 <- round(measureACC(df_tr$truth, df_tr$response), 3)
AUC_lda_tr <- round(measureAUC(df_tr$prob.maligno, df_tr$truth, 'normal', 'maligno'), 3)
# Threshold sweep: accuracy vs decision threshold, for test and train.
# FIX: preallocate the result vectors instead of growing them one element at a
# time, and iterate with seq_along() rather than 1:length().
threshold <- seq(0.1, 0.95, 0.01)
acc <- numeric(length(threshold))
acc2 <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_lda, threshold = threshold[i])
  acc[i] <- measureACC(as.data.frame(pred)$truth, as.data.frame(pred)$response)
  pred2 <- setThreshold(pred_lda2, threshold = threshold[i])
  acc2[i] <- measureACC(as.data.frame(pred2)$truth, as.data.frame(pred2)$response)
}
# Long-format data frame for plotting (test and train stacked)
new_df1 <- data.frame(threshold = threshold, acc = acc, sub_data = 'test')
new_df2 <- data.frame(threshold = threshold, acc = acc2, sub_data = 'train')
new_df <- rbind(new_df1, new_df2)
# new_df1[which.max(new_df1$acc), "threshold"]  # 0.47 (test maximum)
# new_df2[which.max(new_df2$acc), "threshold"]  # 0.52 (train maximum)
# ···························································································
# Plot accuracy as a function of the chosen decision threshold (test vs train)
leyenda <- c('prueba', 'entrenamiento')
ggplot(new_df, aes(x = threshold, y = acc)) +
  geom_line(aes(color = sub_data, linetype = sub_data)) +
  theme +
  scale_color_manual(values = c("red", "darkred"), labels = leyenda) +
  scale_linetype_manual(values = c(1, 2), labels = leyenda) +
  labs(x = 'Umbral', y = 'Métrica de performance (accuracy)',
       title = 'Evaluación del modelo de discriminante lineal LDA',
       color = 'Conjunto de\n evaluación', linetype = 'Conjunto de\n evaluación')

# Threshold-free comparison: ROC curves for LDA on TEST and TRAIN.
# BUG FIX: the original passed `pred`/`pred2` (stale objects left over from the
# threshold loop) instead of pred_lda/pred_lda2, unlike the QDA/RDA sections.
df_lda <- generateThreshVsPerfData(list(lda_te = pred_lda, lda_tr = pred_lda2),
                                   measures = list(fpr, tpr, mmce))
plotROCCurves(df_lda) + theme +
  labs(title = 'Curva ROC del modelo discriminante lineal LDA',
       x = 'Tasa de falsos positivos (FPR)',
       y = 'Tasa de positivos verdaderos (TPR)',
       color = 'Conjunto de\n evaluación') +
  scale_color_manual(values = c("red", "darkred"), labels = c('prueba', 'entrenamiento')) +
  # AUC labels derived from the computed values so they cannot drift out of sync
  geom_label(label = sprintf("AUC= %.3f", AUC_lda_te), x = 0.35, y = 0.75,
             label.size = 0.3, size = 4, color = "red", fill = "white") +
  geom_label(label = sprintf("AUC= %.3f", AUC_lda_tr), x = 0.068, y = 0.95,
             label.size = 0.3, size = 4, color = "darkred", fill = "white")

# ---- LDA performance summary: accuracy and AUC on test (prueba) / train (entrenamiento) ----
tabla_lda <- rbind(
  Métrica   = c('valor', 'datos'),
  Accuracy  = c(acc_lda1, 'prueba'),
  Accuracy. = c(acc_lda2, 'entrenamiento'),
  AUC_ROC   = c(AUC_lda_te, 'prueba'),
  AUC_ROC.  = c(AUC_lda_tr, 'entrenamiento')
)
kable(tabla_lda)
| Métrica | valor | datos |
| Accuracy | 0.809 | prueba |
| Accuracy. | 0.853 | entrenamiento |
| AUC_ROC | 0.86 | prueba |
| AUC_ROC. | 0.91 | entrenamiento |
ANÁLISIS DE DISCRIMINANTE CUADRÁTICO (QDA)
# ---- QDA model ----
# Heteroscedasticity makes QDA more appropriate than LDA, although the
# normality assumption of QDA does not hold either.
set.seed(1)
task <- makeClassifTask(data = datos_tr[-2], target = "diagnosis")
lrn2 <- makeLearner("classif.qda", predict.type = "prob")
mod2 <- mlr::train(lrn2, task)
# TEST predictions: accuracy and AUC (ROC)
pred_qda <- predict(mod2, newdata = datos_te[-2])
df_te <- as.data.frame(pred_qda)
acc_qda1 <- round(measureACC(df_te$truth, df_te$response), 3)
AUC_qda_te <- round(measureAUC(df_te$prob.maligno, df_te$truth, 'normal', 'maligno'), 3)
# TRAIN (naive) predictions: accuracy and AUC (ROC)
pred_qda2 <- predict(mod2, newdata = datos_tr[-2])
df_tr <- as.data.frame(pred_qda2)
acc_qda2 <- round(measureACC(df_tr$truth, df_tr$response), 3)
AUC_qda_tr <- round(measureAUC(df_tr$prob.maligno, df_tr$truth, 'normal', 'maligno'), 3)
# Threshold sweep: accuracy vs decision threshold (test and train).
# FIX: preallocate result vectors and use seq_along() instead of 1:length().
threshold <- seq(0.1, 0.95, 0.01)
acc <- numeric(length(threshold))
acc2 <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_qda, threshold = threshold[i])
  acc[i] <- measureACC(as.data.frame(pred)$truth, as.data.frame(pred)$response)
  pred2 <- setThreshold(pred_qda2, threshold = threshold[i])
  acc2[i] <- measureACC(as.data.frame(pred2)$truth, as.data.frame(pred2)$response)
}
new_df1 <- data.frame(threshold = threshold, acc = acc, sub_data = 'test')
new_df2 <- data.frame(threshold = threshold, acc = acc2, sub_data = 'train')
new_df <- rbind(new_df1, new_df2)
# new_df1[which.max(new_df1$acc), "threshold"]  # test 0.68
# new_df2[which.max(new_df2$acc), "threshold"]  # train 0.8
# ···························································································
# Plot accuracy as a function of the chosen decision threshold (test vs train)
leyenda <- c('prueba', 'entrenamiento')
ggplot(new_df, aes(x = threshold, y = acc)) +
  geom_line(aes(color = sub_data, linetype = sub_data)) +
  theme +
  scale_color_manual(values = c("red", "darkred"), labels = leyenda) +
  scale_linetype_manual(values = c(1, 2), labels = leyenda) +
  labs(x = 'Umbral', y = 'Métrica de performance (accuracy)',
       title = 'Evaluación del modelo de discriminante cuadrático QDA',
       color = 'Conjunto de\n evaluación', linetype = 'Conjunto de\n evaluación')

# Threshold-free comparison: ROC curves for QDA on TEST and TRAIN
df_qda <- generateThreshVsPerfData(list(qda_te = pred_qda, qda_tr = pred_qda2),
                                   measures = list(fpr, tpr, mmce))
plotROCCurves(df_qda) + theme +
  labs(title = 'Curva ROC del modelo discriminante cuadrático QDA',
       x = 'Tasa de falsos positivos (FPR)',
       y = 'Tasa de positivos verdaderos (TPR)',
       color = 'Conjunto de\n evaluación') +
  scale_color_manual(values = c("red", "darkred"), labels = c('prueba', 'entrenamiento')) +
  # FIX: AUC labels derived from the computed values instead of hard-coded text
  geom_label(label = sprintf("AUC= %.3f", AUC_qda_te), x = 0.35, y = 0.75,
             label.size = 0.3, size = 4, color = "red", fill = "white") +
  geom_label(label = sprintf("AUC= %.3f", AUC_qda_tr), x = 0.068, y = 0.95,
             label.size = 0.3, size = 4, color = "darkred", fill = "white")

# ---- QDA performance summary: accuracy and AUC on test (prueba) / train (entrenamiento) ----
tabla_qda <- rbind(
  Métrica   = c('valor', 'datos'),
  Accuracy  = c(acc_qda1, 'prueba'),
  Accuracy. = c(acc_qda2, 'entrenamiento'),
  AUC_ROC   = c(AUC_qda_te, 'prueba'),
  AUC_ROC.  = c(AUC_qda_tr, 'entrenamiento')
)
kable(tabla_qda)
| Métrica | valor | datos |
| Accuracy | 0.817 | prueba |
| Accuracy. | 0.808 | entrenamiento |
| AUC_ROC | 0.929 | prueba |
| AUC_ROC. | 0.923 | entrenamiento |
DISCRIMINANTE CUADRÁTICO ROBUSTO
# ---- Robust QDA via MCD (Minimum Covariance Determinant) estimates ----
# Robust location/scatter per class; columns 3:7 are the numeric predictors.
cov_normal <- cov.rob(data[data$diagnosis == "normal", c(3:7)], method = "mcd", nsamp = "best")
cov_maligno <- cov.rob(data[data$diagnosis == "maligno", c(3:7)], method = "mcd", nsamp = "best")
data2 <- data %>% dplyr::select(3:7)
# BUG FIX: the original subtracted rep(center, 183) from the data frame, which
# recycles the 5 center values DOWN the rows (with a length warning) instead of
# across columns, and then read distrob[i] from the full n x n matrix
# X %*% solve(S) %*% t(X) — column-major entries, not the diagonal.
# mahalanobis() computes the intended squared robust distances directly.
distrob_normal <- mahalanobis(as.matrix(data2), cov_normal$center, cov_normal$cov)
distrob_maligna <- mahalanobis(as.matrix(data2), cov_maligno$center, cov_maligno$cov)
# Classify each observation by the nearer (robust) class centroid; vectorized
# ifelse replaces the hard-coded 1:381 loop.
clase <- ifelse(distrob_normal < distrob_maligna, 'normal', 'maligno')
print('Matriz de confusión')
# NOTE(review): the confusion matrix shown in the original report
# (55/128/79/119) came from the flawed distance computation and will differ.
table(data$diagnosis, clase)
valor <- round(measureACC(data$diagnosis, clase), 3)
# Print accuracy of the robust classification rule
kable(cbind('Accuracy', valor))
ANÁLISIS DISCRIMINANTE REGULARIZADO (RDA)
# ---- RDA model (more robust to collinearity than LDA/QDA) ----
set.seed(1)
task <- makeClassifTask(data = datos_tr[-2], target = "diagnosis")
lrn3 <- mlr::makeLearner("classif.rda", predict.type = "prob")
mod3 <- mlr::train(lrn3, task)
# TEST predictions: accuracy and AUC (ROC)
pred_rda <- predict(mod3, newdata = datos_te[-2])
df_te <- as.data.frame(pred_rda)
acc_rda1 <- round(measureACC(df_te$truth, df_te$response), 3)
AUC_rda_te <- round(measureAUC(df_te$prob.maligno, df_te$truth, 'normal', 'maligno'), 3)
# TRAIN (naive) predictions: accuracy and AUC (ROC)
pred_rda2 <- predict(mod3, newdata = datos_tr[-2])
df_tr <- as.data.frame(pred_rda2)
acc_rda2 <- round(measureACC(df_tr$truth, df_tr$response), 3)
AUC_rda_tr <- round(measureAUC(df_tr$prob.maligno, df_tr$truth, 'normal', 'maligno'), 3)
# Threshold sweep: accuracy vs decision threshold (test and train).
# FIX: preallocate result vectors and use seq_along() instead of 1:length().
threshold <- seq(0.1, 0.95, 0.01)
acc <- numeric(length(threshold))
acc2 <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_rda, threshold = threshold[i])
  acc[i] <- measureACC(as.data.frame(pred)$truth, as.data.frame(pred)$response)
  pred2 <- setThreshold(pred_rda2, threshold = threshold[i])
  acc2[i] <- measureACC(as.data.frame(pred2)$truth, as.data.frame(pred2)$response)
}
new_df1 <- data.frame(threshold = threshold, acc = acc, sub_data = 'test')
new_df2 <- data.frame(threshold = threshold, acc = acc2, sub_data = 'train')
new_df <- rbind(new_df1, new_df2)
# new_df1[which.max(new_df1$acc), "threshold"]  # test 0.48
# new_df2[which.max(new_df2$acc), "threshold"]  # train 0.56
# ···························································································
# Plot accuracy as a function of the chosen decision threshold (test vs train)
leyenda <- c('prueba', 'entrenamiento')
ggplot(new_df, aes(x = threshold, y = acc)) +
  geom_line(aes(color = sub_data, linetype = sub_data)) +
  theme +
  scale_color_manual(values = c("red", "darkred"), labels = leyenda) +
  scale_linetype_manual(values = c(1, 2), labels = leyenda) +
  labs(x = 'Umbral', y = 'Métrica de performance (accuracy)',
       title = 'Evaluación del modelo de discriminante regularizado RDA',
       color = 'Conjunto de\n evaluación', linetype = 'Conjunto de\n evaluación')

# Threshold-free comparison: ROC curves for RDA on TEST and TRAIN
df_rda <- generateThreshVsPerfData(list(rda_te = pred_rda, rda_tr = pred_rda2),
                                   measures = list(fpr, tpr, mmce))
plotROCCurves(df_rda) + theme +
  labs(title = 'Curva ROC del modelo discriminante regularizado RDA',
       x = 'Tasa de falsos positivos (FPR)',
       y = 'Tasa de positivos verdaderos (TPR)',
       color = 'Conjunto de\n evaluación') +
  scale_color_manual(values = c("red", "darkred"), labels = c('prueba', 'entrenamiento')) +
  # FIX: AUC labels derived from the computed values instead of hard-coded text
  geom_label(label = sprintf("AUC= %.3f", AUC_rda_te), x = 0.35, y = 0.75,
             label.size = 0.3, size = 4, color = "red", fill = "white") +
  geom_label(label = sprintf("AUC= %.3f", AUC_rda_tr), x = 0.068, y = 0.95,
             label.size = 0.3, size = 4, color = "darkred", fill = "white")

# ---- RDA performance summary: accuracy and AUC on test (prueba) / train (entrenamiento) ----
tabla_rda <- rbind(
  Métrica   = c('valor', 'datos'),
  Accuracy  = c(acc_rda1, 'prueba'),
  Accuracy. = c(acc_rda2, 'entrenamiento'),
  AUC_ROC   = c(AUC_rda_te, 'prueba'),
  AUC_ROC.  = c(AUC_rda_tr, 'entrenamiento')
)
kable(tabla_rda)
| Métrica | valor | datos |
| Accuracy | 0.791 | prueba |
| Accuracy. | 0.853 | entrenamiento |
| AUC_ROC | 0.858 | prueba |
| AUC_ROC. | 0.914 | entrenamiento |
REGRESIÓN LOGÍSTICA
# Class balance of `diagnosis` in the training, test and full data sets
balance <- rbind(
  Entrenamiento = table(datos_tr$diagnosis),  # 126 normal, 140 maligno
  Prueba        = table(datos_te$diagnosis),  # 57 normal, 58 maligno
  Total         = table(data$diagnosis)       # 183 normal, 198 maligno
)
kable(balance)
| Entrenamiento | 126 | 140 |
| Prueba | 57 | 58 |
| Total | 183 | 198 |
# ---- Logistic regression model ----
set.seed(1)
task <- makeClassifTask(data = datos_tr[-2], target = "diagnosis")
lrn <- makeLearner("classif.logreg", predict.type = "prob")
mod_lr <- mlr::train(lrn, task)
# TEST predictions: accuracy and AUC (ROC)
pred_lr <- predict(mod_lr, newdata = datos_te[-2])
df_te <- as.data.frame(pred_lr)
acc_lg1 <- round(measureACC(df_te$truth, df_te$response), 3)
AUC_lg_te <- round(measureAUC(df_te$prob.maligno, df_te$truth, 'normal', 'maligno'), 3)
# TRAIN (naive) predictions: accuracy and AUC (ROC)
pred_lr2 <- predict(mod_lr, newdata = datos_tr[-2])
df_tr <- as.data.frame(pred_lr2)
acc_lg2 <- round(measureACC(df_tr$truth, df_tr$response), 3)
AUC_lg_tr <- round(measureAUC(df_tr$prob.maligno, df_tr$truth, 'normal', 'maligno'), 3)
# Threshold sweep: accuracy vs decision threshold (test and train).
# FIX: preallocate result vectors and use seq_along() instead of 1:length().
threshold <- seq(0.1, 0.95, 0.01)
acc <- numeric(length(threshold))
acc2 <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_lr, threshold = threshold[i])
  acc[i] <- measureACC(as.data.frame(pred)$truth, as.data.frame(pred)$response)
  pred2 <- setThreshold(pred_lr2, threshold = threshold[i])
  acc2[i] <- measureACC(as.data.frame(pred2)$truth, as.data.frame(pred2)$response)
}
# FIX: dropped par(mfcol = c(1, 2)) — it only affects base graphics, not ggplot2.
new_df1 <- data.frame(threshold = threshold, acc = acc, sub_data = 'test')
new_df2 <- data.frame(threshold = threshold, acc = acc2, sub_data = 'train')
new_df <- rbind(new_df1, new_df2)
# new_df1[which.max(new_df1$acc), "threshold"]  # test 0.33
# new_df2[which.max(new_df2$acc), "threshold"]  # train 0.53
# Threshold-free comparison: ROC curves for logistic regression on TEST and TRAIN
df_lr <- generateThreshVsPerfData(list(lgr_te = pred_lr, lgr_tr = pred_lr2),
                                  measures = list(fpr, tpr, mmce))
plotROCCurves(df_lr) + theme +
  labs(title = 'Curva ROC del modelo regresión logística',
       x = 'Tasa de falsos positivos (FPR)',
       y = 'Tasa de positivos verdaderos (TPR)',
       color = 'Conjunto de\n evaluación') +
  scale_color_manual(values = c("red", "darkred"), labels = c('prueba', 'entrenamiento')) +
  # FIX: AUC labels derived from the computed values instead of hard-coded text
  geom_label(label = sprintf("AUC= %.3f", AUC_lg_te), x = 0.35, y = 0.75,
             label.size = 0.3, size = 4, color = "red", fill = "white") +
  geom_label(label = sprintf("AUC= %.3f", AUC_lg_tr), x = 0.07, y = 0.97,
             label.size = 0.3, size = 4, color = "darkred", fill = "white")

# ---- Logistic-regression performance summary: accuracy and AUC on test / train ----
tabla_lg <- rbind(
  Métrica   = c('valor', 'datos'),
  Accuracy  = c(acc_lg1, 'prueba'),
  Accuracy. = c(acc_lg2, 'entrenamiento'),
  AUC_ROC   = c(AUC_lg_te, 'prueba'),
  AUC_ROC.  = c(AUC_lg_tr, 'entrenamiento')
)
kable(tabla_lg)
| Métrica | valor | datos |
| Accuracy | 0.817 | prueba |
| Accuracy. | 0.872 | entrenamiento |
| AUC_ROC | 0.889 | prueba |
| AUC_ROC. | 0.925 | entrenamiento |
# Print the fitted logistic-regression coefficients (log-odds scale)
kable(mod_lr$learner.model$coefficients)
| (Intercept) | 1.2225674 |
| edad | 0.8237644 |
| creatinina | -0.7279764 |
| LYVE1 | 1.5382596 |
| REG1B | 0.6783715 |
| TFF1 | 2.5387519 |
# ···························································································
# Threshold sweep for recall (TPR) and precision (PPV) of the 'maligno' class.
# NOTE(review): both curves are computed on pred_lr2 (TRAINING predictions),
# yet the plot that uses them is titled "subconjunto de prueba" (test) and the
# resulting threshold is later applied to the TEST predictions — confirm which
# set was intended. Behavior kept as in the original.
# FIX: the two identical loops are merged and the vectors preallocated.
threshold <- seq(0.1, 0.95, 0.01)
recall <- numeric(length(threshold))
precision <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_lr2, threshold = threshold[i])
  p <- as.data.frame(pred)
  recall[i] <- measureTPR(p$truth, p$response, 'maligno')
  pred2 <- setThreshold(pred_lr2, threshold = threshold[i])
  p2 <- as.data.frame(pred2)
  precision[i] <- measurePPV(p2$truth, p2$response, 'maligno', probabilities = NULL)
}
# FIX: dropped par(mfcol = c(1, 2)) — it only affects base graphics, not ggplot2.
new_df1 <- data.frame(threshold = threshold, metric = recall, sub_data = 'recall')
new_df2 <- data.frame(threshold = threshold, metric = precision, sub_data = 'precision')
new_dfa <- rbind(new_df1, new_df2)
# ···························································································
# Threshold at which the recall and precision curves intersect
curve1 <- data.frame(x = threshold, y = recall)
curve2 <- data.frame(x = threshold, y = precision)
thr <- curve_intersect(curve1, curve2, empirical = TRUE, domain = NULL)$x
# Recall and precision vs threshold, with their intersection marked by a vline
etiquetas <- c('recall (TPR)', 'precision (PPV)')
ggplot(new_dfa, aes(x = threshold, y = metric)) +
  geom_line(aes(color = sub_data, linetype = sub_data)) +
  theme +
  scale_color_manual(values = c("red", "LightSeaGreen"), labels = etiquetas) +
  scale_linetype_manual(values = c(1, 2), labels = etiquetas) +
  geom_vline(xintercept = thr, linetype = 3, color = 'darkgray') +
  labs(x = 'Umbral', y = 'Métrica de performance',
       title = 'Evaluación del modelo de regresión logística en subconjunto de prueba',
       color = 'Métrica', linetype = 'Métrica')

# ···························································································
# Accuracy vs decision threshold (test and train), with the chosen threshold marked
leyenda <- c('prueba', 'entrenamiento')
ggplot(new_df, aes(x = threshold, y = acc)) +
  geom_line(aes(color = sub_data, linetype = sub_data)) +
  theme +
  scale_color_manual(values = c("red", "darkred"), labels = leyenda) +
  scale_linetype_manual(values = c(1, 2), labels = leyenda) +
  geom_vline(xintercept = thr, linetype = 3, color = 'darkgray') +
  labs(x = 'Umbral', y = 'Métrica de performance (accuracy)',
       title = 'Evaluación del modelo de regresión logística',
       color = 'Conjunto de\n evaluación', linetype = 'Conjunto de\n evaluación')

# Apply the threshold where the recall and precision curves cross, then
# recompute AUC, accuracy, recall and precision at that threshold.
prediccion_threshold <- setThreshold(pred_lr, thr)
pt <- as.data.frame(prediccion_threshold)
# AUC is computed from the probabilities, so it does not depend on the threshold
AUC_lg_tethreshold <- round(measureAUC(pt$prob.maligno, pt$truth, 'normal', 'maligno'), 3)
accuracy_con_threshold <- round(measureACC(pt$truth, pt$response), 3)
recall_con_threshold <- round(measureTPR(pt$truth, pt$response, 'maligno'), 3)
precision_con_threshold <- round(measurePPV(pt$truth, pt$response, 'maligno'), 3)
MÁQUINAS DE SOPORTE VECTORIAL (SVM) con kernel lineal
# ---- SVM with linear kernel (cost = 2) ----
set.seed(1)
task <- makeClassifTask(data = datos_tr[-2], target = "diagnosis")
lrn_svm <- makeLearner("classif.svm", predict.type = "prob",
                       par.vals = list(kernel = "linear", cost = 2))
mod_svm <- mlr::train(lrn_svm, task)
# TEST predictions: accuracy and AUC (ROC)
pred_svm <- predict(mod_svm, newdata = datos_te[-2])
# BUG FIX: accuracy was computed from a stale `pred` object left over from an
# earlier threshold loop; it must use pred_svm.
df_te <- as.data.frame(pred_svm)
acc_svm1 <- round(measureACC(df_te$truth, df_te$response), 3)
AUC_svm_te <- round(measureAUC(df_te$prob.maligno, df_te$truth, 'normal', 'maligno'), 3)
# TRAIN (naive) predictions: accuracy and AUC (ROC)
# BUG FIX: the original predicted with mod_lr (the logistic-regression model),
# so every SVM training metric duplicated the logistic-regression values.
pred_svm2 <- predict(mod_svm, newdata = datos_tr[-2])
df_tr <- as.data.frame(pred_svm2)
acc_svm2 <- round(measureACC(df_tr$truth, df_tr$response), 3)
AUC_svm_tr <- round(measureAUC(df_tr$prob.maligno, df_tr$truth, 'normal', 'maligno'), 3)
# Threshold sweep: accuracy vs decision threshold (test and train).
threshold <- seq(0.1, 0.95, 0.01)
acc <- numeric(length(threshold))
acc2 <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_svm, threshold = threshold[i])
  acc[i] <- measureACC(as.data.frame(pred)$truth, as.data.frame(pred)$response)
  pred2 <- setThreshold(pred_svm2, threshold = threshold[i])
  acc2[i] <- measureACC(as.data.frame(pred2)$truth, as.data.frame(pred2)$response)
}
# FIX: dropped par(mfcol = c(1, 2)) — it only affects base graphics, not ggplot2.
new_df1 <- data.frame(threshold = threshold, acc = acc, sub_data = 'test')
new_df2 <- data.frame(threshold = threshold, acc = acc2, sub_data = 'train')
new_df <- rbind(new_df1, new_df2)
# new_df1[which.max(new_df1$acc), "threshold"]  # previously: test 0.39
# new_df2[which.max(new_df2$acc), "threshold"]  # previously: train 0.53
# ···························································································
# Plot accuracy as a function of the chosen decision threshold (test vs train)
leyenda <- c('prueba', 'entrenamiento')
ggplot(new_df, aes(x = threshold, y = acc)) +
  geom_line(aes(color = sub_data, linetype = sub_data)) +
  theme +
  scale_color_manual(values = c("red", "darkred"), labels = leyenda) +
  scale_linetype_manual(values = c(1, 2), labels = leyenda) +
  labs(x = 'Umbral', y = 'Métrica de performance (accuracy)',
       title = 'Evaluación del modelo de Máquinas de soporte vectorial SVM',
       color = 'Conjunto de\n evaluación', linetype = 'Conjunto de\n evaluación')

# Threshold-free comparison: ROC curves for the linear-kernel SVM on TEST and TRAIN
df_svm <- generateThreshVsPerfData(list(svm_te = pred_svm, svm_tr = pred_svm2),
                                   measures = list(fpr, tpr, mmce))
plotROCCurves(df_svm) + theme +
  labs(title = 'Curva ROC del modelo de Máquinas de soporte vectorial SVM kernel lineal',
       x = 'Tasa de falsos positivos (FPR)',
       y = 'Tasa de positivos verdaderos (TPR)',
       color = 'Conjunto de\n evaluación') +
  scale_color_manual(values = c("red", "darkred"), labels = c('prueba', 'entrenamiento')) +
  # FIX: AUC labels derived from the computed values instead of hard-coded text
  geom_label(label = sprintf("AUC= %.3f", AUC_svm_te), x = 0.35, y = 0.75,
             label.size = 0.3, size = 4, color = "red", fill = "white") +
  geom_label(label = sprintf("AUC= %.3f", AUC_svm_tr), x = 0.07, y = 0.97,
             label.size = 0.3, size = 4, color = "darkred", fill = "white")

# ---- Linear-kernel SVM performance summary: accuracy and AUC on test / train ----
tabla_svm <- rbind(
  Métrica   = c('valor', 'datos'),
  Accuracy  = c(acc_svm1, 'prueba'),
  Accuracy. = c(acc_svm2, 'entrenamiento'),
  AUC_ROC   = c(AUC_svm_te, 'prueba'),
  AUC_ROC.  = c(AUC_svm_tr, 'entrenamiento')
)
kable(tabla_svm)
| Métrica | valor | datos |
| Accuracy | 0.583 | prueba |
| Accuracy. | 0.872 | entrenamiento |
| AUC_ROC | 0.894 | prueba |
| AUC_ROC. | 0.925 | entrenamiento |
MÁQUINAS DE SOPORTE VECTORIAL (SVM) con kernel sigmoideo
# ---- SVM with sigmoid kernel (cost = 2) ----
set.seed(1)
task <- makeClassifTask(data = datos_tr[-2], target = "diagnosis")
lrn_svm <- makeLearner("classif.svm", predict.type = "prob",
                       par.vals = list(kernel = "sigmoid", cost = 2))
mod_svm <- mlr::train(lrn_svm, task)
# TEST predictions: accuracy and AUC (ROC)
pred_svm <- predict(mod_svm, newdata = datos_te[-2])
# BUG FIX: accuracy was computed from a stale `pred` object left over from an
# earlier threshold loop; it must use pred_svm.
df_te <- as.data.frame(pred_svm)
acc_svm1 <- round(measureACC(df_te$truth, df_te$response), 3)
AUC_svm_te <- round(measureAUC(df_te$prob.maligno, df_te$truth, 'normal', 'maligno'), 3)
# TRAIN (naive) predictions: accuracy and AUC (ROC)
# BUG FIX: the original predicted with mod_lr (the logistic-regression model),
# so every SVM training metric duplicated the logistic-regression values.
pred_svm2 <- predict(mod_svm, newdata = datos_tr[-2])
df_tr <- as.data.frame(pred_svm2)
acc_svm2 <- round(measureACC(df_tr$truth, df_tr$response), 3)
AUC_svm_tr <- round(measureAUC(df_tr$prob.maligno, df_tr$truth, 'normal', 'maligno'), 3)
# Threshold sweep: accuracy vs decision threshold (test and train).
threshold <- seq(0.1, 0.95, 0.01)
acc <- numeric(length(threshold))
acc2 <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_svm, threshold = threshold[i])
  acc[i] <- measureACC(as.data.frame(pred)$truth, as.data.frame(pred)$response)
  pred2 <- setThreshold(pred_svm2, threshold = threshold[i])
  acc2[i] <- measureACC(as.data.frame(pred2)$truth, as.data.frame(pred2)$response)
}
# FIX: dropped par(mfcol = c(1, 2)) — it only affects base graphics, not ggplot2.
new_df1 <- data.frame(threshold = threshold, acc = acc, sub_data = 'test')
new_df2 <- data.frame(threshold = threshold, acc = acc2, sub_data = 'train')
new_df <- rbind(new_df1, new_df2)
# new_df1[which.max(new_df1$acc), "threshold"]  # previously: test 0.39
# new_df2[which.max(new_df2$acc), "threshold"]  # previously: train 0.53
# ···························································································
# Plot accuracy as a function of the chosen decision threshold (test vs train)
leyenda <- c('prueba', 'entrenamiento')
ggplot(new_df, aes(x = threshold, y = acc)) +
  geom_line(aes(color = sub_data, linetype = sub_data)) +
  theme +
  scale_color_manual(values = c("red", "darkred"), labels = leyenda) +
  scale_linetype_manual(values = c(1, 2), labels = leyenda) +
  labs(x = 'Umbral', y = 'Métrica de performance (accuracy)',
       title = 'Evaluación del modelo de Máquinas de soporte vectorial SVM',
       color = 'Conjunto de\n evaluación', linetype = 'Conjunto de\n evaluación')

# Threshold-free comparison: ROC curves for the sigmoid-kernel SVM on TEST and TRAIN
df_svm <- generateThreshVsPerfData(list(svm_te = pred_svm, svm_tr = pred_svm2),
                                   measures = list(fpr, tpr, mmce))
plotROCCurves(df_svm) + theme +
  # BUG FIX: the title said "kernel radial" but this section fits the sigmoid kernel
  labs(title = 'Curva ROC del modelo de Máquinas de soporte vectorial SVM kernel sigmoideo',
       x = 'Tasa de falsos positivos (FPR)',
       y = 'Tasa de positivos verdaderos (TPR)',
       color = 'Conjunto de\n evaluación') +
  scale_color_manual(values = c("red", "darkred"), labels = c('prueba', 'entrenamiento')) +
  # FIX: AUC labels derived from the computed values instead of hard-coded text
  geom_label(label = sprintf("AUC= %.3f", AUC_svm_te), x = 0.35, y = 0.75,
             label.size = 0.3, size = 4, color = "red", fill = "white") +
  geom_label(label = sprintf("AUC= %.3f", AUC_svm_tr), x = 0.07, y = 0.97,
             label.size = 0.3, size = 4, color = "darkred", fill = "white")

# ---- Sigmoid-kernel SVM performance summary: accuracy and AUC on test / train ----
tabla_svm <- rbind(
  Métrica   = c('valor', 'datos'),
  Accuracy  = c(acc_svm1, 'prueba'),
  Accuracy. = c(acc_svm2, 'entrenamiento'),
  AUC_ROC   = c(AUC_svm_te, 'prueba'),
  AUC_ROC.  = c(AUC_svm_tr, 'entrenamiento')
)
kable(tabla_svm)
| Métrica | valor | datos |
| Accuracy | 0.513 | prueba |
| Accuracy. | 0.872 | entrenamiento |
| AUC_ROC | 0.855 | prueba |
| AUC_ROC. | 0.925 | entrenamiento |
MÁQUINAS DE SOPORTE VECTORIAL (SVM) con kernel radial
# ---- SVM with radial kernel (cost = 2) ----
set.seed(1)
task <- makeClassifTask(data = datos_tr[-2], target = "diagnosis")
lrn_svm <- makeLearner("classif.svm", predict.type = "prob",
                       par.vals = list(kernel = "radial", cost = 2))
mod_svm <- mlr::train(lrn_svm, task)
# TEST predictions: accuracy and AUC (ROC)
pred_svm <- predict(mod_svm, newdata = datos_te[-2])
# BUG FIX: accuracy was computed from a stale `pred` object left over from an
# earlier threshold loop; it must use pred_svm.
df_te <- as.data.frame(pred_svm)
acc_svm1 <- round(measureACC(df_te$truth, df_te$response), 3)
AUC_svm_te <- round(measureAUC(df_te$prob.maligno, df_te$truth, 'normal', 'maligno'), 3)
# TRAIN (naive) predictions: accuracy and AUC (ROC)
# BUG FIX: the original predicted with mod_lr (the logistic-regression model),
# so every SVM training metric duplicated the logistic-regression values.
pred_svm2 <- predict(mod_svm, newdata = datos_tr[-2])
df_tr <- as.data.frame(pred_svm2)
acc_svm2 <- round(measureACC(df_tr$truth, df_tr$response), 3)
AUC_svm_tr <- round(measureAUC(df_tr$prob.maligno, df_tr$truth, 'normal', 'maligno'), 3)
# Threshold sweep: accuracy vs decision threshold (test and train).
threshold <- seq(0.1, 0.95, 0.01)
acc <- numeric(length(threshold))
acc2 <- numeric(length(threshold))
for (i in seq_along(threshold)) {
  pred <- setThreshold(pred_svm, threshold = threshold[i])
  acc[i] <- measureACC(as.data.frame(pred)$truth, as.data.frame(pred)$response)
  pred2 <- setThreshold(pred_svm2, threshold = threshold[i])
  acc2[i] <- measureACC(as.data.frame(pred2)$truth, as.data.frame(pred2)$response)
}
# FIX: dropped par(mfcol = c(1, 2)) — it only affects base graphics, not ggplot2.
new_df1 <- data.frame(threshold = threshold, acc = acc, sub_data = 'test')
new_df2 <- data.frame(threshold = threshold, acc = acc2, sub_data = 'train')
new_df <- rbind(new_df1, new_df2)
# new_df1[which.max(new_df1$acc), "threshold"]  # previously: test 0.39
# new_df2[which.max(new_df2$acc), "threshold"]  # previously: train 0.53
# ···························································································
# Gráfico de cómo varía la métrica de performance accuracy, de acuerdo al umbral elegido
ggplot(new_df, aes(x=threshold, y=acc)) + geom_line(aes(color = sub_data,linetype=sub_data)) +
theme +labs(x='Umbral', y='Métrica de performance (accuracy)',
title= 'Evaluación del modelo de Máquinas de soporte vectorial SVM') +
scale_color_manual(values = c("red", "darkred"),labels=c('prueba','entrenamiento')) +
scale_linetype_manual(values=c(1,2), labels=c('prueba','entrenamiento')) +
labs(color='Conjunto de\n evaluación',linetype='Conjunto de\n evaluación')

# ROC curves for the SVM predictions on TEST and TRAIN, to compare performance
# independently of any particular threshold choice.
# NOTE(review): the AUC figures inside the geom_label() calls below are
# hard-coded and must be kept in sync with AUC_svm_te / AUC_svm_tr.
df_svm = generateThreshVsPerfData(list(svm_te = pred_svm, svm_tr = pred_svm2),
measures = list(fpr, tpr, mmce))
plotROCCurves(df_svm) + theme +
labs(title='Curva ROC del modelo de Máquinas de soporte vectorial SVM kernel radial',
x='Tasa de falsos positivos (FPR)', y='Tasa de positivos verdaderos (TPR)',
color='Conjunto de\n evaluación') +
scale_color_manual(values = c("red", "darkred"), labels=c('prueba','entrenamiento')) +
geom_label(label="AUC= 0.915", x=0.35, y=0.75, label.size = 0.3, size=4,
color = "red",fill="white") +
geom_label(label="AUC= 0.925", x=0.07, y=0.97, label.size = 0.3, size=4,
color = "darkred",fill="white")

# ················ SVM model metrics ················
# The rbind() row labels come from the variable names below (Métrica, Accuracy,
# ...), so these names are part of the printed table and must not be renamed.
Métrica <- c('valor','datos')
Accuracy <- c(acc_svm1,'prueba')
Accuracy. <- c(acc_svm2,'entrenamiento')
AUC_ROC <- c(AUC_svm_te,'prueba')
AUC_ROC. <- c(AUC_svm_tr,'entrenamiento')
# Print performance metrics as a table
kable(rbind(Métrica, Accuracy, Accuracy., AUC_ROC, AUC_ROC.))
| Métrica |
valor |
datos |
| Accuracy |
0.504 |
prueba |
| Accuracy. |
0.872 |
entrenamiento |
| AUC_ROC |
0.915 |
prueba |
| AUC_ROC. |
0.925 |
entrenamiento |
COMPARACIÓN DE MÉTODOS DE CLASIFICACIÓN SUPERVISADA
# ---- Comparison of the supervised classifiers ----
# Confusion-matrix based measures for every model on the TEST predictions.
LDA_metrics <- calculateROCMeasures(pred_lda)
QDA_metrics <- calculateROCMeasures(pred_qda)
RDA_metrics <- calculateROCMeasures(pred_rda)
logreg_metrics <- calculateROCMeasures(pred_lr)
SVM_metrics <- calculateROCMeasures(pred_svm)
# Naive predictions of every model over the whole (scaled) data set,
# used below to colour the PCA biplots.
# (Removed dead assignment `pred_todos = NULL`: the variable was never used.)
pred_todos_lda <- as.data.frame(predict(mod1, newdata = datos_escalados[-2]))
pred_todos_qda <- as.data.frame(predict(mod2, newdata = datos_escalados[-2]))
pred_todos_rda <- as.data.frame(predict(mod3, newdata = datos_escalados[-2]))
pred_todos_lg <- as.data.frame(predict(mod_lr, newdata = datos_escalados[-2]))
pred_todos_svm <- as.data.frame(predict(mod_svm, newdata = datos_escalados[-2]))
# ······················ PCA, original labels ··················································
# Biplot on principal components 1 and 2 coloured by the TRUE diagnosis;
# the following biplots recolour the same projection by each model's prediction.
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=0.5,groups=factor(data$diagnosis)) +
scale_color_manual(name="Diagnóstico", values=c('royalblue2','#ff7474ff'),
labels=c("normal","maligno")) +
theme + labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación de las etiquetas reales\nen las componentes principales 1 y 2')+
theme(legend.position=c(.865,.15))

# ······················ PCA + LDA predictions ··················································
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=.5,groups=factor(pred_todos_lda$response)) +
scale_color_manual(name="Predicción", values=c('royalblue2','#ff7474ff'),
labels=c("normal","maligno")) +
theme + labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación de las predicciones ingenuas del modelo LDA\nen las componentes principales 1 y 2') +
theme(legend.position=c(.865,.15))

# ······················ PCA + QDA predictions ··················································
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=.5,groups=factor(pred_todos_qda$response)) +
scale_color_manual(name="Predicción", values=c('royalblue2','#ff7474ff'),
labels=c("normal","maligno")) +
theme + labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación de las predicciones ingenuas del modelo QDA\nen las componentes principales 1 y 2') +
theme(legend.position=c(.865,.15))

# ······················ PCA + RDA predictions ··················································
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=.5,groups=factor(pred_todos_rda$response)) +
scale_color_manual(name="Predicción", values=c('royalblue2','#ff7474ff'),
labels=c("normal","maligno")) +
theme + labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación de las predicciones ingenuas del modelo RDA\nen las componentes principales 1 y 2') +
theme(legend.position=c(.865,.15))

# ······················ PCA + logistic-regression predictions ·································
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=.5,groups=factor(pred_todos_lg$response)) +
scale_color_manual(name="Predicción", values=c('royalblue2','#ff7474ff'),
labels=c("normal","maligno")) +
theme + labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación de las predicciones ingenuas del modelo LG\nen las componentes principales 1 y 2') +
theme(legend.position=c(.865,.15))

# ······················ PCA + SVM predictions ·················································
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=.5,groups=factor(pred_todos_svm$response)) +
scale_color_manual(name="Predicción", values=c('royalblue2','#ff7474ff'),
labels=c("normal","maligno")) +
theme + labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)', title= 'Representación de las predicciones ingenuas del modelo SVM (r)\nen las componentes principales 1 y 2') +
theme(legend.position=c(.865,.15))

# Coefficients of the LDA model.
# NOTE(review): the original comment said "linear regression", but `mod1` is
# the LDA learner trained at the top of this section.
mod1$learner.model
## Call:
## lda(f, data = getTaskData(.task, .subset))
##
## Prior probabilities of groups:
## normal maligno
## 0.4736842 0.5263158
##
## Group means:
## edad creatinina LYVE1 REG1B TFF1
## normal -0.4690016 -0.08257330 -0.6588756 -0.4497313 -0.4290695
## maligno 0.4221014 0.07431597 0.5929880 0.4047582 0.3861626
##
## Coefficients of linear discriminants:
## LD1
## edad 0.4320184
## creatinina -0.2443376
## LYVE1 1.0128370
## REG1B 0.0640522
## TFF1 0.2226155
# ······················ ROC curves for all models (test data) ·····································
# The legend labels below follow the alphabetical order of the list names
# (lda, lg, qda, rda, svm), hence 'LDA','Reg log','QDA','RDA','SVM (r)'.
df_todos = generateThreshVsPerfData(list(lda=pred_lda, qda=pred_qda, rda=pred_rda, lg=pred_lr,
svm = pred_svm), measures = list(fpr, tpr, mmce))
plotROCCurves(df_todos) + theme +
labs(title='Curvas ROC de modelos de clasificación supervisada (datos de prueba)',
x='Tasa de falsos positivos (FPR)', y='Tasa de positivos verdaderos (TPR)',
color=' Modelo en\n evaluación') +
scale_color_manual(values = c("red", "black", "blue", "darkgreen","violet"),
labels=c('LDA','Reg log','QDA','RDA','SVM (r)'))+
theme(legend.position=c(0.915,0.25))

# ············ AUC values for all models when every variable is considered ···················
AUC_values <- rbind(AUC_lda_te, AUC_qda_te, AUC_rda_te, AUC_lg_te, AUC_svm_te)
AUC_values <- as.data.frame(AUC_values)
AUC_values$Modelo <- c('LDA','QDA','RDA','Reg Log','SVM')
colnames(AUC_values) <- c('Area debajo de la curva (AUC)','Modelo')
row.names(AUC_values) <- NULL
AUC_values <- AUC_values%>%dplyr::select(2,1)
# Print results
kable(AUC_values)
| LDA |
0.860 |
| QDA |
0.929 |
| RDA |
0.858 |
| Reg Log |
0.889 |
| SVM |
0.915 |
MÉTODOS DE CLASIFICACIÓN NO SUPERVISADA / CLUSTERING
ELECCIÓN DEL NÚMERO DE CLUSTERS
datos_para_cluster = data[3:7]  # numeric columns (age + biomarkers) used for clustering
# Analysis of the number of clusters. This first chunk only defines helpers:
# a function that computes metrics (SSE, silhouette) guiding the choice of k.
# Compute cluster-quality metrics for k = 2..kmax on a scaled data matrix.
#
# datA_esc: scaled numeric matrix/data frame to cluster.
# kmax:     largest number of clusters to evaluate.
# f:        "kmeans" (centroid-based) or "pam" (medoid-based; much slower).
#
# Returns a data.frame with columns `sse` and `sil`; row i holds the metrics
# for i clusters. Row 1 is never filled (k starts at 2), so it stays NA and
# callers drop it with complete.cases().
metrica <- function(datA_esc, kmax, f) {
  sil <- array()
  sse <- array()
  datA_dist <- dist(datA_esc, method = "euclidean", diag = FALSE, upper = FALSE, p = 2)
  for (i in 2:kmax) {
    if (strcmp(f, "kmeans")) {
      # NOTE(review): iter.max = kmax looks accidental (kmax is the number of
      # clusters to scan, not an iteration budget) — confirm intent before changing.
      CL <- kmeans(datA_esc, centers = i, nstart = 50, iter.max = kmax)
      sse[i] <- CL$tot.withinss
      CL_sil <- silhouette(CL$cluster, datA_dist)
      sil[i] <- summary(CL_sil)$avg.width
    }
    if (strcmp(f, "pam")) {
      CL <- pam(x = datA_esc, k = i, diss = FALSE, metric = "euclidean")
      sse[i] <- CL$objective[1]
      sil[i] <- CL$silinfo$avg.width
    }
  }
  # (Removed two dead statements `sse` / `sil` before the return: inside a
  # function body their values were silently discarded.)
  return(data.frame(sse, sil))
}
# Study how many clusters are advisable according to typical indicators
# (e.g. the silhouette coefficient and the within-cluster SSE).
kmax = 10
m1 = metrica(scale(datos_para_cluster),kmax,"kmeans") # k-means variant
m1 <- m1[complete.cases(m1),]   # drop row 1 (k = 1 is never computed, stays NA)
m1$kcluster <- seq(2,kmax,1)
# Position-based select(): reorder to kcluster, sse, sil — keep column order stable
m1 <- m1%>%dplyr::select(3,1,2)
# Reshape to long format (one row per k/metric) for faceted plotting
m1_sse <- m1%>%dplyr::select(-3)%>%mutate(metric='SSE')
colnames(m1_sse) <- c('kcluster','value','metric')
m1_sil <- m1%>%dplyr::select(-2)%>%mutate(metric='SIL')
colnames(m1_sil) <- c('kcluster','value','metric')
m1 <- rbind(m1_sse,m1_sil)
# Plot of the SIL and SSE metrics vs the number of clusters
ggplot(m1, aes(kcluster, value, linetype=metric)) + geom_line(col='red') +
facet_wrap(~metric, ncol=1, scales='free')+theme+geom_point(col='red', size=2, fill='pink', shape=21)+
labs(title='Determinación de número de clusters',
x='k Número de clusters', y='Valor', linetype='Métrica')+
scale_x_continuous(breaks = seq(1, kmax, by = 1))+
scale_linetype_manual(values=c(1,2))

k-MEANS con k=2
set.seed(1)
cantidad_clusters=2
# k-means (k = 2) on the scaled clustering variables; store cluster id per patient
CL = kmeans(scale(datos_para_cluster),cantidad_clusters)
data$kmeans = CL$cluster
# Side-by-side 3D scatterplots: true labels (left) vs k = 2 clustering (right)
par(mfrow=c(1,2))
# -------------------------------------------------------------------------------------------
col1 <- c('royalblue2','#ff7474ff')
col1 <- col1[as.numeric(data$diagnosis)]  # one colour per patient, by true diagnosis
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(col1,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Realidad')
legend("topright", bty = "n", cex = .9, title = "Diagnóstico", c("normal", "maligno"), fill = c('royalblue2','#ff7474ff'))
# -------------------------------------------------------------------------------------------
colors <- c('orange','#a25da2a5')
colors <- colors[as.numeric(data$kmeans)]  # one colour per patient, by cluster id
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(colors,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Clustering')
legend("topright", bty = "n", cex = .9, title = "Grupo k-means", c("1", "2"), fill = c('orange','#a25da2a5'))

# A biplot is convenient here because it also shows the arrows of the original variables
# ORIGINAL PLOT: true diagnosis in PCA coordinates
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=0.5,
groups=factor(data$diagnosis)) +
scale_color_manual(name="diagnosis",values=c('royalblue2','#ff7474ff'),
labels=c("normal",'maligno')) +
theme + labs(title='Análisis de componentes principales') +
theme(legend.position=c(.85,.15)) +
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)')

# -------------------------------------------------------------------------------------------
# K-MEANS PLOT: cluster assignment in PCA coordinates
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1, alpha=0.5,groups = as.factor(data$kmeans) )+
scale_color_manual(name="Grupo k-means", values=c("orange",'#a25da2a5',"darkgreen"),
labels=c("1", "2","3")) + theme+
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación del clustering utilizando k-means')+theme(legend.position=c(.85,.15))

# ·····················································
# How many patients of each diagnosis fall in each cluster (counts + row %).
pacientes_cluster1 <- subset(data, kmeans == '1')
pacientes_cluster2 <- subset(data, kmeans == '2')
cluster1 <- table(pacientes_cluster1$diagnosis)
cluster2 <- table(pacientes_cluster2$diagnosis)
porcentaje_cluster.1 <- round(100 * prop.table(cluster1), 2)
porcentaje_cluster.2 <- round(100 * prop.table(cluster2), 2)
# ·····················································
# Print results: counts next to percentages (rbind() row labels come from the
# variable names, so they must stay cluster1/cluster2)
conteos <- rbind(cluster1,cluster2)
porcentajes <- rbind(porcentaje_cluster.1,porcentaje_cluster.2)
kable(cbind(conteos, porcentajes))
| cluster1 |
5 |
83 |
5.68 |
94.32 |
| cluster2 |
178 |
115 |
60.75 |
39.25 |
k-MEANS con k=2 sin escalar
cantidad_clusters=2
set.seed(1)
# k-means (k = 2) WITHOUT scaling — for comparison against the scaled run above
CL = kmeans(datos_para_cluster,cantidad_clusters)
data$kmeans = CL$cluster
par(mfrow=c(1,2))
# -------------------------------------------------------------------------------------------
# ORIGINAL PLOT: true diagnosis
col1 <- c('royalblue2','#ff7474ff')
col1 <- col1[as.numeric(data$diagnosis)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(col1,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Realidad')
legend("topright", bty = "n", cex = .9, title = "Diagnóstico", c("normal", "maligno"), fill = c('royalblue2','#ff7474ff'))
# -------------------------------------------------------------------------------------------
# CLUSTER PLOT: k-means assignments
colors <- c('orange','#a25da2a5')
colors <- colors[as.numeric(data$kmeans)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(colors,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Clustering')
legend("topright", bty = "n", cex = .9, title = "Grupo k-means", c("1", "2"), fill = c('orange','#a25da2a5'))

# -------------------------------------------------------------------------------------------
# A biplot is convenient here because it also shows the arrows of the original variables
# CLUSTER PLOT in PCA coordinates (PC1/PC2)
ggbiplot(datos.pc, obs.scale=.1 ,var.scale=1, alpha=0.5,groups = as.factor(data$kmeans) )+
scale_color_manual(name="Grupo k-means", values=c("orange",'#a25da2a5',"darkgreen"),
labels=c("1", "2","3")) + theme+
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación del clustering utilizando k-means')+theme(legend.position=c(.865,.2)) +
scale_x_continuous(breaks = seq(-2.5, 7.5, by = 2.5))

# ·····················································
# How many patients of each diagnosis fall in each cluster:
pacientes_cluster1 <- data %>% filter (kmeans == '1')
cluster1 <- table(pacientes_cluster1$diagnosis)
porcentaje_cluster.1 <- round(prop.table(cluster1)*100,2)
pacientes_cluster2 <- data %>% filter (kmeans == '2')
cluster2 <- table(pacientes_cluster2$diagnosis)
porcentaje_cluster.2 <- round(prop.table(cluster2)*100,2)
# ·····················································
# Print results (rbind() row labels come from the variable names)
kable(cbind(rbind(cluster1,cluster2),rbind(porcentaje_cluster.1,porcentaje_cluster.2)))
| cluster1 |
1 |
40 |
2.44 |
97.56 |
| cluster2 |
182 |
158 |
53.53 |
46.47 |
k-MEANS con k=3
# NOTE: this section first refits k = 2 on purpose, so that the three panels
# below can show reality vs k = 2 vs k = 3 side by side.
cantidad_clusters=2
set.seed(1)
CL = kmeans(scale(datos_para_cluster),cantidad_clusters)
data$kmeans = CL$cluster
par(mar=c(0, .5, 5.5, 0.5),mfrow=c(1,3))
# -------------------------------------------------------------------------------------------
# ORIGINAL PLOT: true diagnosis
col1 <- c('royalblue2','#ff7474ff')
col1 <- col1[as.numeric(data$diagnosis)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(col1,0.3), box=F,angle=25, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Realidad', cex.lab=.8,scale.y=2, cex.symbols=1.3)
legend(x=4, y=5.5, bty = "n", cex =1.1, title = "Diagnóstico", c("normal", "maligno"), fill = c('royalblue2','#ff7474ff'))
# -------------------------------------------------------------------------------------------
# k = 2 PLOT
colors <- c('orange','#a25da2a5')
colors <- colors[as.numeric(data$kmeans)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(colors,0.3), box=F,angle=25, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Clustering k=2', cex.lab=.8,scale.y=2, cex.symbols=1.3)
legend(x=4, y=5.5, bty = "n", cex = 1.1, title = "Grupo k-means", c("1", "2"), fill = c('orange','#a25da2a5'))
# -------------------------------------------------------------------------------------------
# FIT K-MEANS WITH k = 3 (this assignment is the one kept in data$kmeans below)
cantidad_clusters=3
set.seed(1)
CL = kmeans(scale(datos_para_cluster),cantidad_clusters)
data$kmeans = CL$cluster
colors2 <- c('orange','#a25da2a5', 'darkgreen')
colors2 <- colors2[as.numeric(data$kmeans)]
# -------------------------------------------------------------------------------------------
# k = 3 PLOT
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(colors2,0.3), box=F,angle=25, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Clustering k=3', cex.lab=.8,scale.y=2, cex.symbols=1.3)
legend(x=4, y=5.5, bty = "n", cex = 1.1, title = "Grupo k-means", c("1", "2","3"), fill = c('orange','#a25da2a5','darkgreen'))

# -------------------------------------------------------------------------------------------
# K-MEANS PLOTS IN PCA COORDINATES
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1,alpha=0.5,
groups=factor(data$diagnosis)) +
scale_color_manual(name="diagnosis",values=c('royalblue2','#ff7474ff'),
labels=c("normal",'maligno')) +
theme + labs(title='Análisis de componentes principales') +
theme(legend.position=c(.85,.15)) +
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)')

ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1, alpha=0.5,groups = as.factor(data$kmeans) )+
scale_color_manual(name="Grupo k-means", values=c("orange",'#a25da2a5',"darkgreen"),
labels=c("1", "2","3")) + theme+
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación del clustering utilizando k-means')+theme(legend.position=c(.85,.15))

# ·····················································
# How many patients of each diagnosis fall in each of the 3 clusters:
pacientes_cluster1 <- data %>% filter (kmeans == '1')
cluster1 <- table(pacientes_cluster1$diagnosis)
porcentaje_cluster.1 <- round(prop.table(cluster1)*100,2)
# ·····················································
pacientes_cluster2 <- data %>% filter (kmeans == '2')
cluster2 <- table(pacientes_cluster2$diagnosis)
porcentaje_cluster.2 <- round(prop.table(cluster2)*100,2)
# ·····················································
pacientes_cluster3 <- data %>% filter (kmeans == '3')
cluster3 <- table(pacientes_cluster3$diagnosis)
porcentaje_cluster.3 <- round(prop.table(cluster3)*100,2)
# ·····················································
# Print results (rbind() row labels come from the variable names)
kable(cbind(rbind(cluster1,cluster2,cluster3),rbind(porcentaje_cluster.1,porcentaje_cluster.2,porcentaje_cluster.3)))
| cluster1 |
0 |
25 |
0.00 |
100.00 |
| cluster2 |
167 |
70 |
70.46 |
29.54 |
| cluster3 |
16 |
103 |
13.45 |
86.55 |
k-MEANS con k=3 sin escalar
cantidad_clusters=3
# FIX (consistency/reproducibility): every sibling k-means section calls
# set.seed() before kmeans(); this one did not, so its results — and the
# printed tables below — were not reproducible across runs.
set.seed(1)
CL = kmeans(datos_para_cluster,cantidad_clusters)
data$kmeans = CL$cluster
# Side-by-side 3D scatterplots: true labels (left) vs k = 3 clustering (right)
par(mfrow=c(1,2))
col1 <- c('royalblue2','#ff7474ff')
col1 <- col1[as.numeric(data$diagnosis)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color =alpha(col1,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Realidad')
legend("topright", bty = "n", cex = .9, title = "Diagnóstico", c("normal", "maligno"), fill = c('royalblue2','#ff7474ff'))
colors <- c('orange','#a25da2a5', 'darkgreen')
colors <- colors[as.numeric(data$kmeans)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(colors,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Clustering')
legend("topright", bty = "n", cex = .9, title = "Grupo k-means", c("1", "2","3"), fill = c('orange','#a25da2a5','darkgreen'))

# A biplot is convenient here because it also shows the arrows of the original variables
ggbiplot(datos.pc, obs.scale=0.1 ,var.scale=1, alpha=0.5,groups = as.factor(data$kmeans) )+
scale_color_manual(name="Grupo k-means", values=c("orange",'#a25da2a5',"darkgreen"),
labels=c("1", "2","3")) + theme+
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación del clustering utilizando k-means')+theme(legend.position=c(.865,.2))

# ·····················································
# How many patients of each diagnosis fall in each of the 3 clusters:
pacientes_cluster1 <- data %>% filter (kmeans == '1')
cluster1 <- table(pacientes_cluster1$diagnosis)
porcentaje_cluster.1 <- round(prop.table(cluster1)*100,2)
# ·····················································
pacientes_cluster2 <- data %>% filter (kmeans == '2')
cluster2 <- table(pacientes_cluster2$diagnosis)
porcentaje_cluster.2 <- round(prop.table(cluster2)*100,2)
# ·····················································
pacientes_cluster3 <- data %>% filter (kmeans == '3')
cluster3 <- table(pacientes_cluster3$diagnosis)
porcentaje_cluster.3 <- round(prop.table(cluster3)*100,2)
# ·····················································
# Print counts and row percentages per cluster
kable(cbind(rbind(cluster1,cluster2,cluster3),rbind(porcentaje_cluster.1,porcentaje_cluster.2,porcentaje_cluster.3)))
| cluster1 |
0 |
24 |
0.00 |
100.00 |
| cluster2 |
8 |
64 |
11.11 |
88.89 |
| cluster3 |
175 |
110 |
61.40 |
38.60 |
k-MEANS con k=2 sin TFF1 (variable), estandarizado
cantidad_clusters=2
datos_para_cluster = data[3:6]  # drops column 7 (TFF1) from the clustering variables
set.seed(1)
CL = kmeans(scale(datos_para_cluster),cantidad_clusters)
data$kmeans = CL$cluster
par(mfrow=c(1,2))
# -------------------------------------------------------------------------------------------
# ORIGINAL PLOT: true diagnosis
col1 <- c('royalblue2','#ff7474ff')
col1 <- col1[as.numeric(data$diagnosis)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(col1,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Realidad')
legend("topright", bty = "n", cex = .9, title = "Diagnóstico", c("normal", "maligno"), fill = c('royalblue2','#ff7474ff'))
# -------------------------------------------------------------------------------------------
# k = 2 PLOT without TFF1
colors <- c('orange','#a25da2a5')
colors <- colors[as.numeric(data$kmeans)]
scatterplot3d(data$edad,data$REG1B,data$LYVE1, color = alpha(colors,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "REG1B", zlab = "LYVE1", main='Clustering')
legend("topright", bty = "n", cex = .9, title = "Grupo k-means", c("1", "2"), fill = c('orange','#a25da2a5'))

# -------------------------------------------------------------------------------------------
# k = 2 PLOT without TFF1 in PCA coordinates
# A biplot is convenient here because it also shows the arrows of the original variables
ggbiplot(datos.pc, obs.scale=.01 ,var.scale=1, alpha=0.5,groups = as.factor(data$kmeans) )+
scale_color_manual(name="Grupo k-means", values=c("orange",'#a25da2a5',"darkgreen"),
labels=c("1", "2","3")) + theme+
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación del clustering utilizando k-means')+theme(legend.position=c(.865,.2))

# ·····················································
# How many patients of each diagnosis fall in each cluster:
pacientes_cluster1 <- data %>% filter (kmeans == '1')
cluster1 <- table(pacientes_cluster1$diagnosis)
porcentaje_cluster.1 <- round(prop.table(cluster1)*100,2)
pacientes_cluster2 <- data %>% filter (kmeans == '2')
cluster2 <- table(pacientes_cluster2$diagnosis)
porcentaje_cluster.2 <- round(prop.table(cluster2)*100,2)
# ·····················································
# Print results
kable(cbind(rbind(cluster1,cluster2),rbind(porcentaje_cluster.1,porcentaje_cluster.2)))
| cluster1 |
13 |
108 |
10.74 |
89.26 |
| cluster2 |
170 |
90 |
65.38 |
34.62 |
k-MEANS con k=2 sin REG1B ni creatinina (variable), estandarizado
set.seed(1234)
cantidad_clusters=2
datos_para_cluster = data[c(3,5,7)]  # keeps edad, LYVE1, TFF1 (drops creatinina and REG1B)
CL = kmeans(scale(datos_para_cluster),cantidad_clusters)
data$kmeans = CL$cluster
par(mfrow=c(1,2))
# -------------------------------------------------------------------------------------------
# ORIGINAL PLOT: true diagnosis
col1 <- c('royalblue2','#ff7474ff')
col1 <- col1[as.numeric(data$diagnosis)]
scatterplot3d(data$edad,data$TFF1,data$LYVE1, color = alpha(col1,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "TFF1", zlab = "LYVE1", main='Realidad')
legend("topright", bty = "n", cex = .9, title = "Diagnóstico", c("normal", "maligno"), fill = c('royalblue2','#ff7474ff'))
colors <- c('orange','#a25da2a5')
colors <- colors[as.numeric(data$kmeans)]
# -------------------------------------------------------------------------------------------
# k = 2 PLOT without creatinina / REG1B (cluster assignments)
scatterplot3d(data$edad,data$TFF1,data$LYVE1, color = alpha(colors,0.3), box=F,angle=45, pch = 19, grid = TRUE, tick.marks = FALSE, xlab = "edad", ylab = "TFF1", zlab = "LYVE1", main='Clustering')
legend("topright", bty = "n", cex = .9, title = "Grupo k-means", c("1", "2"), fill = c('orange','#a25da2a5'))

# -------------------------------------------------------------------------------------------
# k = 2 clustering (reduced variable set) in PCA coordinates
# A biplot is convenient here because it also shows the arrows of the original variables
ggbiplot(datos.pc, obs.scale=.01 ,var.scale=1, alpha=0.5,groups = as.factor(data$kmeans) )+
scale_color_manual(name="Grupo k-means", values=c("orange",'#a25da2a5',"darkgreen"),
labels=c("1", "2","3")) + theme+
labs(x='PC1 (51.5% explicado)', y= 'PC2 (22.5% explicado)',
title= 'Representación del clustering utilizando k-means')+theme(legend.position=c(.865,.2))

# ·····················································
# How many patients of each diagnosis fall in each cluster:
pacientes_cluster1 <- data %>% filter (kmeans == '1')
cluster1 <- table(pacientes_cluster1$diagnosis)
porcentaje_cluster.1 <- round(prop.table(cluster1)*100,2)
pacientes_cluster2 <- data %>% filter (kmeans == '2')
cluster2 <- table(pacientes_cluster2$diagnosis)
porcentaje_cluster.2 <- round(prop.table(cluster2)*100,2)
# ·····················································
# Print results
kable(cbind(rbind(cluster1,cluster2),rbind(porcentaje_cluster.1,porcentaje_cluster.2)))
| cluster1 |
177 |
81 |
68.60 |
31.40 |
| cluster2 |
6 |
117 |
4.88 |
95.12 |
COMPARACIÓN DE DISTRIBUCIONES DE LAS VARIABLES SEGÚN DIAGNÓSTICO Y SEGÚN CLUSTER
calculo de distribución de cada variable en los datos de acuerdo al diagnóstico o el cluster Análisis estadístico z para evaluar diferencia de medias entre dos grupos (TCL por n>30)
# Boxplots of each retained variable split by true diagnosis (top panel)
# and by k-means cluster (bottom panel), on a shared grid.
# NOTE(review): gather() is superseded; tidyr::pivot_longer() is the modern
# equivalent — kept as-is to preserve row ordering exactly.
comp_media1 <- data %>% gather(-c(1,2,4,6,8), key = "var", value = "value") %>%
ggplot(aes(x = as.factor(diagnosis), y = value)) +theme(plot.margin = unit(c(2.0,0.4,0,.7), "cm"))+
geom_boxplot(aes(fill=diagnosis, alpha=0.6), outlier.colour='gray', outlier.shape=1, outlier.size=1) +
facet_wrap(~ var, scales = "free") +theme(legend.position='none')+
theme+labs(x="Diagnóstico", y= NULL)+scale_fill_manual(values=c('royalblue2','#ff7474ff'))
data$kmeans <- factor(data$kmeans, levels= c(1,2))
comp_media2 <-data %>% gather(-c(1,2,4,6,8), key = "var", value = "value") %>%
ggplot(aes(x = as.factor(kmeans), y = value)) +theme(plot.margin = unit(c(0.5,0.4,0,.7), "cm"))+
geom_boxplot(aes(fill=kmeans, alpha=0.6), outlier.colour='gray', outlier.shape=1, outlier.size=1) +
facet_wrap(~ var, scales = "free") +theme(legend.position='none')+
theme+labs(x="Cluster", y= NULL)+scale_fill_manual(values=c('orange','#a25da2a5'))
# Stack both panels and add shared title/axis annotations
plot_grid(comp_media1, comp_media2, nrow = 2,rel_heights = c(1, .8))+
annotate("text", x=.5, y=.9, size=4, label='atop(bold("Comparación de variables"),"según diagnóstico y cluster")', parse=TRUE)+
annotate("text", angle=90,x=0.02, y=.5, size=4, label='bold("Valor estandarizado de la variable")', parse=TRUE)

# The qq-plots already showed the variables are not univariate normal (when
# split by diagnosis). By the CLT (n > 30) the sample means are approximately
# normal, so a z-test for the difference of means is used, plugging in the
# sample standard deviations as sigma.
pacientes_N <- data%>%filter(diagnosis=='normal')
pacientes_M <- data%>%filter(diagnosis=='maligno')
# Two-sample z-tests: normal vs maligno, per variable
edad <- z.test(pacientes_N$edad, sigma.x=sd(pacientes_N$edad), pacientes_M$edad, sigma.y=sd(pacientes_M$edad), conf.level=0.95)$p.value
LYVE1 <- z.test(pacientes_N$LYVE1, sigma.x=sd(pacientes_N$LYVE1), pacientes_M$LYVE1, sigma.y=sd(pacientes_M$LYVE1), conf.level=0.95)$p.value
TFF1 <- z.test(pacientes_N$TFF1, sigma.x=sd(pacientes_N$TFF1), pacientes_M$TFF1, sigma.y=sd(pacientes_M$TFF1), conf.level=0.95)$p.value
# Same tests between the two k-means clusters
pacientes_1 <- data%>%filter(kmeans=='1')
pacientes_2 <- data%>%filter(kmeans=='2')
edad_c <- z.test(pacientes_1$edad, sigma.x=sd(pacientes_1$edad), pacientes_2$edad, sigma.y=sd(pacientes_2$edad), conf.level=0.95)$p.value
LYVE1_C <- z.test(pacientes_1$LYVE1, sigma.x=sd(pacientes_1$LYVE1), pacientes_2$LYVE1, sigma.y=sd(pacientes_2$LYVE1), conf.level=0.95)$p.value
TFF1_c <- z.test(pacientes_1$TFF1, sigma.x=sd(pacientes_1$TFF1), pacientes_2$TFF1, sigma.y=sd(pacientes_2$TFF1), conf.level=0.95)$p.value
variable <- c('diagnóstico','cluster')
# Print p-values as a table (rows: variables; columns: grouping used)
kable(rbind(variable,cbind(rbind(edad, LYVE1, TFF1),rbind(edad_c, LYVE1_C, TFF1_c))))
| variable |
diagnóstico |
cluster |
| edad |
5.99013023227831e-17 |
5.82384399900471e-26 |
| LYVE1 |
6.57093753385587e-54 |
1.97729701568712e-129 |
| TFF1 |
5.40610601825638e-21 |
1.78929076732458e-20 |
Clustering jerárquico con muestra balanceada (para diagnóstico) de tamaño n=100
Cálculo de matriz de distancias
Euclidea
# Choose a subset of 100 observations to build the dendrogram
cantidad_clusters=2
set.seed(1407)
par(mfcol = c(1,1))
data_c_diag = data[-c(2,8)] # without the sexo and kmeans columns
sample_cluster1 <- data_c_diag[sample(1:nrow(data_c_diag), 100,replace=FALSE),]
# Scale the numeric columns, then re-attach diagnosis and put it first
sample_cluster <- as.data.frame(scale(sample_cluster1[,2:6]))
sample_cluster$diagnosis <- sample_cluster1$diagnosis
sample_cluster <- sample_cluster%>%dplyr::select(6,1:5)
# Print how many observations fall in each diagnosis category of the n=100 sample
kable(table(sample_cluster$diagnosis))
# PCA on the scaled sample (used for later projections)
datos.pc2 = prcomp(sample_cluster[-1],scale = TRUE)
# Euclidean distance matrix
mat_dist <- dist(x = sample_cluster[-1], method = "euclidean")
# Dendrograms (one per hierarchical linkage method)
hc_complete <- hclust(d = mat_dist, method = "complete")
hc_average <- hclust(d = mat_dist, method = "average")
hc_single <- hclust(d = mat_dist, method = "single")
hc_ward <- hclust(d = mat_dist, method = "ward.D2")
# Cophenetic correlation coefficient per linkage (higher = more faithful tree)
completo <- round(cor(x = mat_dist, cophenetic(hc_complete)),3)
promedio <- round(cor(x = mat_dist, cophenetic(hc_average)),3)
simple <- round(cor(x = mat_dist, cophenetic(hc_single)),3)
ward <- round(cor(x = mat_dist, cophenetic(hc_ward)),3)
valores_coef <- cbind(completo,promedio,simple,ward)
# Print cophenetic coefficient values
kable(valores_coef)
Manhattan (esta es la que informo en TP, porque da mejores resultados)
# Same n=100 subset (same seed), but with Manhattan distances — this is the
# variant reported in the write-up because it gives better results.
cantidad_clusters=2
set.seed(1407)
par(mfcol = c(1,1))
data_c_diag = data[-c(2,8)] # without the sexo and kmeans columns
sample_cluster1 <- data_c_diag[sample(1:nrow(data_c_diag), 100,replace=FALSE),]
sample_cluster <- as.data.frame(scale(sample_cluster1[,2:6]))
sample_cluster$diagnosis <- sample_cluster1$diagnosis
sample_cluster <- sample_cluster%>%dplyr::select(6,1:5)
# PCA on the scaled sample
datos.pc2 = prcomp(sample_cluster[-1],scale = TRUE)
# Manhattan distance matrix
mat_dist <- dist(x = sample_cluster[-1], method = "manhattan")
# Dendrograms (one per hierarchical linkage method)
hc_complete <- hclust(d = mat_dist, method = "complete")
hc_average <- hclust(d = mat_dist, method = "average")
hc_single <- hclust(d = mat_dist, method = "single")
hc_ward <- hclust(d = mat_dist, method = "ward.D2")
# Cophenetic correlation coefficient per linkage
completo <- round(cor(x = mat_dist, cophenetic(hc_complete)),3)
promedio <- round(cor(x = mat_dist, cophenetic(hc_average)),3)
simple <- round(cor(x = mat_dist, cophenetic(hc_single)),3)
ward <- round(cor(x = mat_dist, cophenetic(hc_ward)),3)
valores_coef <- cbind(completo,promedio,simple,ward)
# Print cophenetic coefficient values
kable(valores_coef)
CLUSTERING JERÁRQUICO con clusters= 2
# Cut each dendrogram into k = cantidad_clusters (= 2) groups
jer_ward<-cutree(hc_ward,k=cantidad_clusters)
jer_average<-cutree(hc_average,k=cantidad_clusters)
jer_complete<-cutree(hc_complete,k=cantidad_clusters)
jer_single<-cutree(hc_single,k=cantidad_clusters)
# Attach the cluster labels to the data frame; the per-cluster
# composition tables below filter on these columns
sample_cluster$jer_ward=jer_ward
sample_cluster$jer_average=jer_average
sample_cluster$jer_complete=jer_complete
sample_cluster$jer_single=jer_single
Construcción de dendograma con distancia Ward
# Ward dendrogram construction
# NOTE(review): `mar` is assigned to a plain variable, not passed to par() —
# this line has no effect on plot margins; confirm intent
mar = c(5.1, 4.1, 4.1, 2.1)
# Two leaf colours indexed by the diagnosis factor (blue / red);
# the variable is named `pch` but holds colours, not plotting symbols
pch=c('royalblue2','#ff7474ff')
# Leaf colours must follow the dendrogram's leaf order, hence order.dendrogram()
cols=alpha(pch[sample_cluster$diagnosis[order.dendrogram(as.dendrogram(hc_ward))]],0.7)
dend_ward <- color_branches(as.dendrogram(hc_ward), k = 2)
dend_ward <- set(dend_ward, "labels_cex", 0.1)
# NOTE(review): chaining plot()/mtext() with `+` only works because each call
# returns NULL; the calls run for their side effects and grafico1 holds no plot
grafico1 <- dend_ward %>% set("leaves_pch",19)%>%
set("leaves_cex", .9) %>% set("leaves_col", cols) %>%
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.8, 'Distancia Ward')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
# Legend: diagnosis colour key (coordinates are in plot units)
legend(5,28, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia promedio
# Average-linkage dendrogram, same recipe as above
cols_a=alpha(pch[sample_cluster$diagnosis[order.dendrogram(as.dendrogram(hc_average))]],0.7)
dend_average <- color_branches(as.dendrogram(hc_average), k = 2)
dend_average <- set(dend_average, "labels_cex", 0.1)
grafico2 <- dend_average %>% set("leaves_pch",19) %>%
set("leaves_cex", .8) %>% set("leaves_col", cols_a) %>%
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.3, 'Distancia Promedio')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(76,8, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia completa
# Complete-linkage dendrogram
cols_c=alpha(pch[sample_cluster$diagnosis[order.dendrogram(as.dendrogram(hc_complete))]],0.7)
dend_complete <- color_branches(as.dendrogram(hc_complete), k = 2)
dend_complete <- set(dend_complete, "labels_cex", 0.1)
grafico3 <- dend_complete %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_c) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.3, 'Distancia Completa')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(85,16, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia simple
# Single-linkage dendrogram
cols_s=alpha(pch[sample_cluster$diagnosis[order.dendrogram(as.dendrogram(hc_single))]],0.7)
dend_single <- color_branches(as.dendrogram(hc_single), k = 2)
dend_single <- set(dend_single, "labels_cex", 0.1)
grafico4 <- dend_single %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_s) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.7)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.7, 'Distancia Simple')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(80,7, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Cálculo de cuántos pacientes de cada diagnóstico hay en cada cluster según las distintas distancias utilizadas Ward
# Diagnosis breakdown per Ward cluster (k = 2): absolute counts and
# within-cluster percentages, shown side by side.
# ·····················································
ward_cluster1 <- filter(sample_cluster, jer_ward == '1')
cluster1 <- table(ward_cluster1$diagnosis)
Ward_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
ward_cluster2 <- filter(sample_cluster, jer_ward == '2')
cluster2 <- table(ward_cluster2$diagnosis)
Ward_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(Ward_cluster.1, Ward_cluster.2)))
| cluster1 |
48 |
27 |
64 |
36 |
| cluster2 |
2 |
23 |
8 |
92 |
Promedio
# Diagnosis breakdown per average-linkage cluster (k = 2): counts and
# within-cluster percentages.
# ·····················································
promedio_cluster1 <- filter(sample_cluster, jer_average == '1')
cluster1 <- table(promedio_cluster1$diagnosis)
promedio_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
promedio_cluster2 <- filter(sample_cluster, jer_average == '2')
cluster2 <- table(promedio_cluster2$diagnosis)
promedio_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(promedio_cluster.1, promedio_cluster.2)))
| cluster1 |
50 |
49 |
50.51 |
49.49 |
| cluster2 |
0 |
1 |
0.00 |
100.00 |
Completo
# Diagnosis breakdown per complete-linkage cluster (k = 2): counts and
# within-cluster percentages.
# ·····················································
completo_cluster1 <- filter(sample_cluster, jer_complete == '1')
cluster1 <- table(completo_cluster1$diagnosis)
completo_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
completo_cluster2 <- filter(sample_cluster, jer_complete == '2')
cluster2 <- table(completo_cluster2$diagnosis)
completo_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(completo_cluster.1, completo_cluster.2)))
| cluster1 |
50 |
45 |
52.63 |
47.37 |
| cluster2 |
0 |
5 |
0.00 |
100.00 |
Simple
# Diagnosis breakdown per single-linkage cluster (k = 2): counts and
# within-cluster percentages.
# ·····················································
simple_cluster1 <- filter(sample_cluster, jer_single == '1')
cluster1 <- table(simple_cluster1$diagnosis)
simple_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
simple_cluster2 <- filter(sample_cluster, jer_single == '2')
cluster2 <- table(simple_cluster2$diagnosis)
simple_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(simple_cluster.1, simple_cluster.2)))
| cluster1 |
50 |
49 |
50.51 |
49.49 |
| cluster2 |
0 |
1 |
0.00 |
100.00 |
CLUSTERING JERÁRQUICO 3
# Cut each dendrogram into k = 3 groups (same dendrograms as before)
cantidad_clusters=3
jer_ward3<-cutree(hc_ward,k=cantidad_clusters)
jer_average3<-cutree(hc_average,k=cantidad_clusters)
jer_complete3<-cutree(hc_complete,k=cantidad_clusters)
jer_single3<-cutree(hc_single,k=cantidad_clusters)
# Attach the k=3 cluster labels to the data frame (used by the
# per-cluster composition tables below)
sample_cluster$jer_ward3=jer_ward3
sample_cluster$jer_average3=jer_average3
sample_cluster$jer_complete3=jer_complete3
sample_cluster$jer_single3=jer_single3
Construcción de dendograma con distancia Ward
# NOTE(review): `mar` assigned to a plain variable, not via par() — no effect
mar = c(5.1, 4.1, 4.1, 2.1)
# Ward dendrogram now coloured for k = 3 branches; leaf colours still
# encode diagnosis (reuses the `pch` colour vector from the k=2 section)
cols_w=alpha(pch[sample_cluster$diagnosis[order.dendrogram(as.dendrogram(hc_ward))]],0.7)
dend_ward <- color_branches(as.dendrogram(hc_ward), k = 3)
dend_ward <- set(dend_ward, "labels_cex", 0.1)
# NOTE(review): `+` chains NULL-returning base-graphics calls; side effects only
grafico5 <- dend_ward %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_w) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.8, 'Distancia Ward')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(3,32, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Promedio
# Average linkage, k = 3. Reuses cols_a computed in the k=2 section
# (leaf order is the same dendrogram, so the colours remain valid)
dend_average <- color_branches(as.dendrogram(hc_average), k = 3)
dend_average <- set(dend_average, "labels_cex", 0.1)
grafico6 <- dend_average %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_a) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.4, 'Distancia Promedio')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(90,12.2, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Completa
# Complete linkage, k = 3 (reuses cols_c from the k=2 section)
dend_complete <- color_branches(as.dendrogram(hc_complete), k = 3)
dend_complete <- set(dend_complete, "labels_cex", 0.1)
grafico7 <- dend_complete %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_c) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.4, 'Distancia Completa')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(85,18, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Simple
# Single linkage, k = 3 (reuses cols_s from the k=2 section)
dend_single <- color_branches(as.dendrogram(hc_single), k = 3)
dend_single <- set(dend_single, "labels_cex", 0.1)
grafico8 <- dend_single %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_s) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.7)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.6, 'Distancia Simple')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(80,7, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Cálculo de cuántos pacientes de cada diagnóstico hay en cada cluster según las distintas distancias utilizadas Ward
# Diagnosis breakdown per Ward cluster (k = 3): counts and within-cluster percentages.
ward_cluster1 <- filter(sample_cluster, jer_ward3 == '1')
cluster1 <- table(ward_cluster1$diagnosis)
Ward_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
ward_cluster2 <- filter(sample_cluster, jer_ward3 == '2')
cluster2 <- table(ward_cluster2$diagnosis)
Ward_cluster.2 <- round(100 * prop.table(cluster2), 2)
# ·····················································
ward_cluster3 <- filter(sample_cluster, jer_ward3 == '3')
cluster3 <- table(ward_cluster3$diagnosis)
Ward_cluster.3 <- round(100 * prop.table(cluster3), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2, cluster3), rbind(Ward_cluster.1, Ward_cluster.2, Ward_cluster.3)))
| cluster1 |
48 |
27 |
64.0 |
36.0 |
| cluster2 |
0 |
9 |
0.0 |
100.0 |
| cluster3 |
2 |
14 |
12.5 |
87.5 |
Promedio
# Diagnosis breakdown per average-linkage cluster (k = 3).
promedio_cluster1 <- filter(sample_cluster, jer_average3 == '1')
cluster1 <- table(promedio_cluster1$diagnosis)
promedio_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
promedio_cluster2 <- filter(sample_cluster, jer_average3 == '2')
cluster2 <- table(promedio_cluster2$diagnosis)
promedio_cluster.2 <- round(100 * prop.table(cluster2), 2)
# ·····················································
promedio_cluster3 <- filter(sample_cluster, jer_average3 == '3')
cluster3 <- table(promedio_cluster3$diagnosis)
promedio_cluster.3 <- round(100 * prop.table(cluster3), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2, cluster3), rbind(promedio_cluster.1, promedio_cluster.2, promedio_cluster.3)))
| cluster1 |
50 |
41 |
54.95 |
45.05 |
| cluster2 |
0 |
8 |
0.00 |
100.00 |
| cluster3 |
0 |
1 |
0.00 |
100.00 |
Completo
# Diagnosis breakdown per complete-linkage cluster (k = 3).
completo_cluster1 <- filter(sample_cluster, jer_complete3 == '1')
cluster1 <- table(completo_cluster1$diagnosis)
completo_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
completo_cluster2 <- filter(sample_cluster, jer_complete3 == '2')
cluster2 <- table(completo_cluster2$diagnosis)
completo_cluster.2 <- round(100 * prop.table(cluster2), 2)
# ·····················································
completo_cluster3 <- filter(sample_cluster, jer_complete3 == '3')
cluster3 <- table(completo_cluster3$diagnosis)
completo_cluster.3 <- round(100 * prop.table(cluster3), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2, cluster3), rbind(completo_cluster.1, completo_cluster.2, completo_cluster.3)))
| cluster1 |
49 |
29 |
62.82 |
37.18 |
| cluster2 |
1 |
16 |
5.88 |
94.12 |
| cluster3 |
0 |
5 |
0.00 |
100.00 |
Simple
# Diagnosis breakdown per single-linkage cluster (k = 3).
simple_cluster1 <- filter(sample_cluster, jer_single3 == '1')
cluster1 <- table(simple_cluster1$diagnosis)
simple_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
simple_cluster2 <- filter(sample_cluster, jer_single3 == '2')
cluster2 <- table(simple_cluster2$diagnosis)
simple_cluster.2 <- round(100 * prop.table(cluster2), 2)
# ·····················································
simple_cluster3 <- filter(sample_cluster, jer_single3 == '3')
cluster3 <- table(simple_cluster3$diagnosis)
simple_cluster.3 <- round(100 * prop.table(cluster3), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2, cluster3), rbind(simple_cluster.1, simple_cluster.2, simple_cluster.3)))
| cluster1 |
50 |
48 |
51.02 |
48.98 |
| cluster2 |
0 |
1 |
0.00 |
100.00 |
| cluster3 |
0 |
1 |
0.00 |
100.00 |
CLUSTERING JERÁRQUICO 2 sin TFF1
# Same subset as before (same seed -> the same 100 sampled rows)
cantidad_clusters=2
set.seed(1407)
par(mfcol = c(1,1))
data_c_diag = data[-c(2,8)] # drop sex and k-means columns
sample_cluster1 <- data_c_diag[sample(1:nrow(data_c_diag), 100,replace=FALSE),]
sample_cluster <- as.data.frame(scale(sample_cluster1[,2:6]))
sample_cluster$diagnosis <- sample_cluster1$diagnosis
# Drop TFF1 (the 5th scaled variable); diagnosis becomes column 1
sample_cluster_sinTFF <- sample_cluster%>%dplyr::select(6,1:4)
# Manhattan distance matrix WITHOUT TFF1.
# FIX(review): previously this used sample_cluster[-1], which dropped the wrong
# column (a scaled variable instead of diagnosis), kept TFF1, and included the
# diagnosis factor — producing the "NAs introduced by coercion" warning in the
# knitted output. Use the reduced data frame, excluding its diagnosis column.
mat_dist <- dist(x = sample_cluster_sinTFF[-1], method = "manhattan")
# Dendrograms (one per hierarchical linkage method)
hc_complete <- hclust(d = mat_dist, method = "complete")
hc_average <- hclust(d = mat_dist, method = "average")
hc_single <- hclust(d = mat_dist, method = "single")
hc_ward <- hclust(d = mat_dist, method = "ward.D2")
# Cophenetic correlation coefficient per linkage method
completo <- round(cor(x = mat_dist, cophenetic(hc_complete)),3)
promedio <- round(cor(x = mat_dist, cophenetic(hc_average)),3)
simple <- round(cor(x = mat_dist, cophenetic(hc_single)),3)
ward <- round(cor(x = mat_dist, cophenetic(hc_ward)),3)
valores_coef <- cbind(completo,promedio,simple,ward)
# Print the cophenetic coefficients
kable(valores_coef)
# Cut each dendrogram into k = cantidad_clusters groups
jer_ward<-cutree(hc_ward,k=cantidad_clusters)
jer_average<-cutree(hc_average,k=cantidad_clusters)
jer_complete<-cutree(hc_complete,k=cantidad_clusters)
jer_single<-cutree(hc_single,k=cantidad_clusters)
# Attach cluster labels to the reduced data frame
sample_cluster_sinTFF$jer_ward=jer_ward
sample_cluster_sinTFF$jer_average=jer_average
sample_cluster_sinTFF$jer_complete=jer_complete
sample_cluster_sinTFF$jer_single=jer_single
Construcción de dendograma con distancia Ward
# Dendrogram construction (without TFF1)
# NOTE(review): `mar` assigned to a plain variable, not via par() — no effect
mar = c(5.1, 4.1, 4.1, 2.1)
# Leaf colours indexed by diagnosis (variable named `pch` but holds colours)
pch=c('royalblue2','#ff7474ff')
# Colours follow the dendrogram's leaf order
cols=alpha(pch[sample_cluster_sinTFF$diagnosis[order.dendrogram(as.dendrogram(hc_ward))]],0.7)
dend_ward <- color_branches(as.dendrogram(hc_ward), k = 2)
dend_ward <- set(dend_ward, "labels_cex", 0.1)
# NOTE(review): `+` chains NULL-returning base-graphics calls; side effects only
grafico1 <- dend_ward %>% set("leaves_pch",19)%>%
set("leaves_cex", .9) %>% # node point size
set("leaves_col", cols) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.8, 'Distancia Ward')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(85,40, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Promedio
# Average-linkage dendrogram (without TFF1)
cols_a=alpha(pch[sample_cluster_sinTFF$diagnosis[order.dendrogram(as.dendrogram(hc_average))]],0.7)
dend_average <- color_branches(as.dendrogram(hc_average), k = 2)
dend_average <- set(dend_average, "labels_cex", 0.1)
grafico2 <- dend_average %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_a) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.3, 'Distancia Promedio')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(90,14.2, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia completa
# Complete-linkage dendrogram (without TFF1)
cols_c=alpha(pch[sample_cluster_sinTFF$diagnosis[order.dendrogram(as.dendrogram(hc_complete))]],0.7)
dend_complete <- color_branches(as.dendrogram(hc_complete), k = 2)
dend_complete <- set(dend_complete, "labels_cex", 0.1)
grafico3 <- dend_complete %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_c) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.3, 'Distancia Completa')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(5,18, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Simple
# Single-linkage dendrogram (without TFF1)
cols_s=alpha(pch[sample_cluster_sinTFF$diagnosis[order.dendrogram(as.dendrogram(hc_single))]],0.7)
dend_single <- color_branches(as.dendrogram(hc_single), k = 2)
dend_single <- set(dend_single, "labels_cex", 0.1)
grafico4 <- dend_single %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_s) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.7)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.7, 'Distancia Simple')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(80,7, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Ward
# Diagnosis breakdown per Ward cluster (k = 2, TFF1 excluded).
# ·····················································
ward_cluster1 <- filter(sample_cluster_sinTFF, jer_ward == '1')
cluster1 <- table(ward_cluster1$diagnosis)
Ward_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
ward_cluster2 <- filter(sample_cluster_sinTFF, jer_ward == '2')
cluster2 <- table(ward_cluster2$diagnosis)
Ward_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(Ward_cluster.1, Ward_cluster.2)))
| cluster1 |
50 |
43 |
53.76 |
46.24 |
| cluster2 |
0 |
7 |
0.00 |
100.00 |
Promedio
# Diagnosis breakdown per average-linkage cluster (k = 2, TFF1 excluded).
# ·····················································
promedio_cluster1 <- filter(sample_cluster_sinTFF, jer_average == '1')
cluster1 <- table(promedio_cluster1$diagnosis)
promedio_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
promedio_cluster2 <- filter(sample_cluster_sinTFF, jer_average == '2')
cluster2 <- table(promedio_cluster2$diagnosis)
promedio_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(promedio_cluster.1, promedio_cluster.2)))
| cluster1 |
50 |
49 |
50.51 |
49.49 |
| cluster2 |
0 |
1 |
0.00 |
100.00 |
Completa
# Diagnosis breakdown per complete-linkage cluster (k = 2, TFF1 excluded).
# ·····················································
completo_cluster1 <- filter(sample_cluster_sinTFF, jer_complete == '1')
cluster1 <- table(completo_cluster1$diagnosis)
completo_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
completo_cluster2 <- filter(sample_cluster_sinTFF, jer_complete == '2')
cluster2 <- table(completo_cluster2$diagnosis)
completo_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(completo_cluster.1, completo_cluster.2)))
| cluster1 |
50 |
43 |
53.76 |
46.24 |
| cluster2 |
0 |
7 |
0.00 |
100.00 |
Simple
# Diagnosis breakdown per single-linkage cluster (k = 2, TFF1 excluded).
# ·····················································
simple_cluster1 <- filter(sample_cluster_sinTFF, jer_single == '1')
cluster1 <- table(simple_cluster1$diagnosis)
simple_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
simple_cluster2 <- filter(sample_cluster_sinTFF, jer_single == '2')
cluster2 <- table(simple_cluster2$diagnosis)
simple_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(simple_cluster.1, simple_cluster.2)))
| cluster1 |
50 |
49 |
50.51 |
49.49 |
| cluster2 |
0 |
1 |
0.00 |
100.00 |
CLUSTERING JERÁRQUICO 2 sin REG1B ni creatinina
# Same subset as before (same seed -> the same 100 sampled rows)
cantidad_clusters=2
set.seed(1407)
par(mfcol = c(1,1))
data_c_diag = data[-c(2,8)] # drop sex and k-means columns
sample_cluster1 <- data_c_diag[sample(1:nrow(data_c_diag), 100,replace=FALSE),]
sample_cluster <- as.data.frame(scale(sample_cluster1[,2:6]))
sample_cluster$diagnosis <- sample_cluster1$diagnosis
# Drop REG1B and creatinine (scaled variables 2 and 4); diagnosis first.
# NOTE(review): the name "sinTFFniC" is misleading — per the section heading
# this variant removes REG1B and creatinine, not TFF1.
sample_cluster_sinTFFniC <- sample_cluster%>%dplyr::select(6,1,3,5)
# Manhattan distance matrix WITHOUT REG1B and creatinine.
# FIX(review): previously this used sample_cluster[-1], which kept both
# variables and included the diagnosis factor — producing the "NAs introduced
# by coercion" warning in the knitted output. Use the reduced data frame,
# excluding its diagnosis column.
mat_dist <- dist(x = sample_cluster_sinTFFniC[-1], method = "manhattan")
# Dendrograms (one per hierarchical linkage method)
hc_complete <- hclust(d = mat_dist, method = "complete")
hc_average <- hclust(d = mat_dist, method = "average")
hc_single <- hclust(d = mat_dist, method = "single")
hc_ward <- hclust(d = mat_dist, method = "ward.D2")
# Cophenetic correlation coefficient per linkage method
completo <- round(cor(x = mat_dist, cophenetic(hc_complete)),3)
promedio <- round(cor(x = mat_dist, cophenetic(hc_average)),3)
simple <- round(cor(x = mat_dist, cophenetic(hc_single)),3)
ward <- round(cor(x = mat_dist, cophenetic(hc_ward)),3)
valores_coef <- cbind(completo,promedio,simple,ward)
# Print the cophenetic coefficients
kable(valores_coef)
# Cut each dendrogram into k = cantidad_clusters groups
jer_ward<-cutree(hc_ward,k=cantidad_clusters)
jer_average<-cutree(hc_average,k=cantidad_clusters)
jer_complete<-cutree(hc_complete,k=cantidad_clusters)
jer_single<-cutree(hc_single,k=cantidad_clusters)
# Attach cluster labels to the reduced data frame
sample_cluster_sinTFFniC$jer_ward=jer_ward
sample_cluster_sinTFFniC$jer_average=jer_average
sample_cluster_sinTFFniC$jer_complete=jer_complete
sample_cluster_sinTFFniC$jer_single=jer_single
Construcción de dendograma con distancia Ward
# Dendrogram construction (without REG1B / creatinine)
# NOTE(review): `mar` assigned to a plain variable, not via par() — no effect
mar = c(5.1, 4.1, 4.1, 2.1)
# Leaf colours indexed by diagnosis (variable named `pch` but holds colours)
pch=c('royalblue2','#ff7474ff')
# Colours follow the dendrogram's leaf order
cols=alpha(pch[sample_cluster_sinTFFniC$diagnosis[order.dendrogram(as.dendrogram(hc_ward))]],0.7)
dend_ward <- color_branches(as.dendrogram(hc_ward), k = 2)
dend_ward <- set(dend_ward, "labels_cex", 0.1)
# NOTE(review): `+` chains NULL-returning base-graphics calls; side effects only
grafico1 <- dend_ward %>% set("leaves_pch",19)%>%
set("leaves_cex", .9) %>% # node point size
set("leaves_col", cols) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.8, 'Distancia Ward')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(85,40, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Promedio
# Average-linkage dendrogram
cols_a=alpha(pch[sample_cluster_sinTFFniC$diagnosis[order.dendrogram(as.dendrogram(hc_average))]],0.7)
dend_average <- color_branches(as.dendrogram(hc_average), k = 2)
dend_average <- set(dend_average, "labels_cex", 0.1)
grafico2 <- dend_average %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_a) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.3, 'Distancia Promedio')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(90,14.2, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Completa
# Complete-linkage dendrogram
cols_c=alpha(pch[sample_cluster_sinTFFniC$diagnosis[order.dendrogram(as.dendrogram(hc_complete))]],0.7)
dend_complete <- color_branches(as.dendrogram(hc_complete), k = 2)
dend_complete <- set(dend_complete, "labels_cex", 0.1)
grafico3 <- dend_complete %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_c) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.6)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.3, 'Distancia Completa')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(5,18, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Construcción de dendograma con distancia Simple
# Single-linkage dendrogram
cols_s=alpha(pch[sample_cluster_sinTFFniC$diagnosis[order.dendrogram(as.dendrogram(hc_single))]],0.7)
dend_single <- color_branches(as.dendrogram(hc_single), k = 2)
dend_single <- set(dend_single, "labels_cex", 0.1)
grafico4 <- dend_single %>% set("leaves_pch",19) %>% # node point type
set("leaves_cex", .8) %>% # node point size
set("leaves_col", cols_s) %>% #node point color
plot(main = "Dendrograma jerárquico", ylab='Distancia',cex.lab=1, cex.axis=.7)+
mtext(side = 3, line = 0.5, at = 1, adj = -1.7, 'Distancia Simple')+
mtext(side = 1, line = 0.5, at = 1, adj = -4.1, 'Paciente')
legend(80,7, title='Diagnóstico',
legend = c("normal" , "maligno"),
col = c('royalblue2','#ff7474ff') ,
pch = c(19,19), bty = "n", pt.cex = 1.5, cex = 0.8 ,
text.col = "black", horiz = FALSE, inset = c(0, 0.1))

Ward
# Diagnosis breakdown per Ward cluster (k = 2, REG1B/creatinine excluded).
# ·····················································
ward_cluster1 <- filter(sample_cluster_sinTFFniC, jer_ward == '1')
cluster1 <- table(ward_cluster1$diagnosis)
Ward_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
ward_cluster2 <- filter(sample_cluster_sinTFFniC, jer_ward == '2')
cluster2 <- table(ward_cluster2$diagnosis)
Ward_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(Ward_cluster.1, Ward_cluster.2)))
| cluster1 |
50 |
43 |
53.76 |
46.24 |
| cluster2 |
0 |
7 |
0.00 |
100.00 |
Promedio
# Diagnosis breakdown per average-linkage cluster (k = 2, REG1B/creatinine excluded).
# ·····················································
promedio_cluster1 <- filter(sample_cluster_sinTFFniC, jer_average == '1')
cluster1 <- table(promedio_cluster1$diagnosis)
promedio_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
promedio_cluster2 <- filter(sample_cluster_sinTFFniC, jer_average == '2')
cluster2 <- table(promedio_cluster2$diagnosis)
promedio_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(promedio_cluster.1, promedio_cluster.2)))
| cluster1 |
50 |
49 |
50.51 |
49.49 |
| cluster2 |
0 |
1 |
0.00 |
100.00 |
Completa
# Diagnosis breakdown per complete-linkage cluster (k = 2, REG1B/creatinine excluded).
# ·····················································
completo_cluster1 <- filter(sample_cluster_sinTFFniC, jer_complete == '1')
cluster1 <- table(completo_cluster1$diagnosis)
completo_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
completo_cluster2 <- filter(sample_cluster_sinTFFniC, jer_complete == '2')
cluster2 <- table(completo_cluster2$diagnosis)
completo_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(completo_cluster.1, completo_cluster.2)))
| cluster1 |
50 |
43 |
53.76 |
46.24 |
| cluster2 |
0 |
7 |
0.00 |
100.00 |
Simple
# Diagnosis breakdown per single-linkage cluster (k = 2, REG1B/creatinine excluded).
# ·····················································
simple_cluster1 <- filter(sample_cluster_sinTFFniC, jer_single == '1')
cluster1 <- table(simple_cluster1$diagnosis)
simple_cluster.1 <- round(100 * prop.table(cluster1), 2)
# ·····················································
simple_cluster2 <- filter(sample_cluster_sinTFFniC, jer_single == '2')
cluster2 <- table(simple_cluster2$diagnosis)
simple_cluster.2 <- round(100 * prop.table(cluster2), 2)
# Counts on the left, percentages on the right
kable(cbind(rbind(cluster1, cluster2), rbind(simple_cluster.1, simple_cluster.2)))
| cluster1 |
50 |
49 |
50.51 |
49.49 |
| cluster2 |
0 |
1 |
0.00 |
100.00 |
TSNE
# t-SNE on the raw (unscaled) variables, colouring points by diagnosis
tsne_data <- data
tsne_data <- tsne_data%>% dplyr::select(c(1:7))
tsne_data$diagnosis<-as.factor(tsne_data$diagnosis)
Labels<-tsne_data$diagnosis
colors = c('royalblue2','#ff7474ff')
names(colors) = unique(tsne_data$diagnosis)
## Run the algorithm (iteration output hidden in the report)
# Same seed before each run so only perplexity differs between fits
set.seed(1409)
tsne <- Rtsne(tsne_data[,-1], dims = 2, perplexity=12, verbose=TRUE, max_iter = 5000)
set.seed(1409)
tsne1 <- Rtsne(tsne_data[,-1], dims = 2, perplexity=5, verbose=TRUE, max_iter = 5000)
set.seed(1409)
tsne2 <- Rtsne(tsne_data[,-1], dims = 2, perplexity=50, verbose=TRUE, max_iter = 5000)
# Embeddings (2-D coordinates in $Y) joined with the diagnosis labels
tsne_df <- as.data.frame(tsne$Y)
tsne_df <- cbind(tsne_df,as.data.frame(Labels))
tsne_df1 <- as.data.frame(tsne1$Y)
tsne_df1 <- cbind(tsne_df1,as.data.frame(Labels))
tsne_df2 <- as.data.frame(tsne2$Y)
tsne_df2 <- cbind(tsne_df2,as.data.frame(Labels))
# One scatter per perplexity; `theme` here is a ggplot theme object defined
# earlier in the document (NOTE(review): confirm it is in scope)
t1 <- ggplot(data=tsne_df1, aes(x=V1, y=V2))+geom_point(aes(color = Labels),alpha = 0.5)+
theme+labs(x='Dimensión 1', y=' Dimensión 2')+theme(legend.position = 'none')+
scale_color_manual(values=c('royalblue2','#ff7474ff'))+theme(plot.margin = unit(c(2.2,0.25,0.2,0.2), "cm"))
t3 <- ggplot(data=tsne_df2, aes(x=V1, y=V2))+geom_point(aes(color = Labels),alpha = 0.5)+
theme+labs(x='Dimensión 1', y=NULL, color='Diagnóstico')+theme(legend.position = c(1.36,.5))+
scale_color_manual(values=c('royalblue2','#ff7474ff'))+theme(plot.margin = unit(c(2.2,0.25,0.2,0.2), "cm"))
t2 <- ggplot(data=tsne_df, aes(x=V1, y=V2))+geom_point(aes(color = Labels),alpha = 0.5)+
theme+labs(x='Dimensión 1', y=NULL)+theme(legend.position = 'none')+
scale_color_manual(values=c('royalblue2','#ff7474ff'))+theme(plot.margin = unit(c(2.2,0.25,0.2,0.2), "cm")) #+
# geom_ellipse(aes(x0 = -22, y0 = -4, a = 19, b = 45, angle = 0.99),linetype=3, color='darkgray') +
#geom_ellipse(aes(x0 = 26, y0 = 12, a = 19, b = 42, angle = 0.75),linetype=3, color='darkgray')
# ncol = 4 with three panels presumably leaves a narrow slot for the
# outside legend of t3 — TODO confirm; annotate() adds titles in canvas coords
plot_grid(t1, t2, t3, align = "h", ncol = 4, rel_widths = c(2.2,2,2,1))+
annotate("text", x=.46, y=.9, size=5, label='atop(bold("Análisis de TSNE"),"")', parse=TRUE)+
annotate("text", x=.18, y=.82, size=3.5, label='atop(italic("perplexity= 5"),"")', parse=TRUE)+
annotate("text", x=.46, y=.82, size=3.5, label='atop(italic("perplexity= 12"),"")', parse=TRUE)+
annotate("text", x=.735, y=.82, size=3.5, label='atop(italic("perplexity= 50"),"")', parse=TRUE)

TSNE con datos escalados
# t-SNE on the standardized variables -----------------------------------------
# Same pipeline as the raw-data run, over `datos_escalados`, at five
# perplexities. BUG FIX vs. the previous version: the grid placed the
# perplexity-10 panel first and perplexity-5 second, while the annotations
# label the panels 5, 10, 15, 20, 25 left-to-right; panels are now ordered to
# match their captions. The visible legend (last panel) also gets the
# 'Diagnóstico' title that was previously set only on a hidden legend.
tsne_data2 <- datos_escalados %>% dplyr::select(c(1:7))
tsne_data2$diagnosis <- as.factor(tsne_data2$diagnosis)
Labels <- tsne_data2$diagnosis
colors <- c('royalblue2', '#ff7474ff')
names(colors) <- unique(tsne_data2$diagnosis)
# NOTE(review): `colors` is not used below (panels hardcode the same values
# in factor-level order); kept in case a later chunk relies on it.

## Run the algorithm (iteration output hidden); seed reset before every run
## so each embedding is reproducible independently of execution order.
run_tsne_scaled <- function(perp) {
  set.seed(1409)
  fit <- Rtsne(tsne_data2[, -1], dims = 2, perplexity = perp,
               verbose = TRUE, max_iter = 5000)
  # 2-D embedding (V1, V2) plus the diagnosis label for colouring.
  cbind(as.data.frame(fit$Y), as.data.frame(Labels))
}
tsne_df  <- run_tsne_scaled(5)
tsne_df1 <- run_tsne_scaled(10)
tsne_df2 <- run_tsne_scaled(15)
tsne_df3 <- run_tsne_scaled(20)
tsne_df4 <- run_tsne_scaled(25)

# One scatter panel of an embedding; legend suppressed unless a position is given.
tsne_panel_scaled <- function(df, y_lab = NULL, legend_pos = 'none') {
  ggplot(data = df, aes(x = V1, y = V2)) +
    geom_point(aes(color = Labels), alpha = 0.5) +
    theme +
    labs(x = 'Dimensión 1', y = y_lab) +
    theme(legend.position = legend_pos) +
    scale_color_manual(values = c('royalblue2', '#ff7474ff')) +
    theme(plot.margin = unit(c(2.4, 0.25, 0.2, 0.2), "cm"))
}
# Panels in ascending perplexity order so they line up with the annotations.
t1 <- tsne_panel_scaled(tsne_df, y_lab = ' Dimensión 2')        # perplexity 5
t2 <- tsne_panel_scaled(tsne_df1)                               # perplexity 10
t3 <- tsne_panel_scaled(tsne_df2)                               # perplexity 15
t4 <- tsne_panel_scaled(tsne_df3)                               # perplexity 20
# Legend placed outside the panel (x > 1) so it lands in the grid's spare column.
t5 <- tsne_panel_scaled(tsne_df4, legend_pos = c(1.5, 0.5)) +   # perplexity 25
  labs(color = 'Diagnóstico')

# Five panels side by side; the narrow sixth column hosts t5's legend.
plot_grid(t1, t2, t3, t4, t5, align = "h", ncol = 6, rel_widths = c(1, 1, 1, 1, 1, 0.6)) +
  annotate("text", x = .46,  y = .9,  size = 5,   label = 'atop(bold("Análisis de TSNE"),"")', parse = TRUE) +
  annotate("text", x = .46,  y = .87, size = 4,   label = 'atop("datos estandarizados")', parse = TRUE) +
  annotate("text", x = .11,  y = .79, size = 3.5, label = 'atop(italic("perplexity= 5"),"")',  parse = TRUE) +
  annotate("text", x = .285, y = .79, size = 3.5, label = 'atop(italic("perplexity= 10"),"")', parse = TRUE) +
  annotate("text", x = .46,  y = .79, size = 3.5, label = 'atop(italic("perplexity= 15"),"")', parse = TRUE) +
  annotate("text", x = .64,  y = .79, size = 3.5, label = 'atop(italic("perplexity= 20"),"")', parse = TRUE) +
  annotate("text", x = .82,  y = .79, size = 3.5, label = 'atop(italic("perplexity= 25"),"")', parse = TRUE)
